From 4eadff52bc23db79e75c622711c000c05c9b7f19 Mon Sep 17 00:00:00 2001
From: q-rz <100142775+q-rz@users.noreply.github.com>
Date: Sun, 30 Jun 2024 23:19:08 +0000
Subject: [PATCH 01/28] Add a new benchmark ENAMEL

---
 README.md                    |  1 +
 bigcode_eval/tasks/enamel.py | 90 ++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 bigcode_eval/tasks/enamel.py

diff --git a/README.md b/README.md
index aa3bb89e3..20941d607 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ Below are the features and tasks of this framework:
     - `StarCoderFIM`: which uses the default FIM tokens `"<fim_prefix>", "<fim_middle>", "<fim_suffix>"`, and
     - `SantaCoderFIM`: which uses SantaCoder FIM tokens `"<fim-prefix>", "<fim-middle>", "<fim-suffix>"`
 - [Mercury](https://huggingface.co/datasets/Elfsong/Mercury) for evaluating computational efficiency of **Python** code generation.
+  - [ENAMEL](https://github.com/q-rz/enamel) for evaluating the efficiency ($\textnormal{eff@}k$) of generated **Python** code against **expert-written** reference solutions on HumanEval problems.
 
 More details about each task can be found in the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).
 
 ## Setup
diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py
new file mode 100644
index 000000000..357d8eabc
--- /dev/null
+++ b/bigcode_eval/tasks/enamel.py
@@ -0,0 +1,90 @@
+"""How efficient is LLM-generated code? A rigorous & high-standard benchmark
+https://arxiv.org/pdf/2406.06647
+
+ENAMEL is a rigorous & high-standard benchmark for evaluating the efficiency of generated code
+compared with **expert-written** reference solutions under 142 HumanEval problems
+
+Homepage: https://github.com/q-rz/enamel
+"""
+
+from warnings import warn
+from bigcode_eval.humaneval import GeneralHumanEval
+
+_CITATION = """
+@article{qiu2024enamel,
+  title={How efficient is {LLM}-generated code? A rigorous \& high-standard benchmark},
+  author={Qiu, Ruizhong and Zeng, Weiliang Will and Tong, Hanghang and Ezick, James and Lott, Christopher},
+  journal={arXiv preprint arXiv:2406.06647},
+  year={2024}
+}
+"""
+
+
+class ENAMEL(GeneralHumanEval):
+    """A task represents an entire benchmark including its dataset, problems,
+    answers, generation settings and evaluation methods.
+    """
+
+    DATASET_PATH = "q-rz/enamel"
+    DATASET_NAME = None
+
+    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout_factor=): # TODO
+        super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None)
+        # TODO
+
+    def get_dataset(self):
+        # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`)
+        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
+        return []
+
+    def fewshot_examples(self):
+        # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist
+        """Loads and returns the few-shot examples for the task if they exist."""
+        pass
+
+    def get_prompt(self, doc):
+        # TODO: build the prompt for the language model from a sample `doc` from the dataset
+        """
+        Builds the prompt for the LM to generate from.
+        :param doc: dict[str: str]
+            sample from the test dataset
+        :return: str
+        """
+        return ""
+
+    def get_reference(self, doc):
+        # TODO: get the reference solution from a sample `doc` from the dataset
+        """
+        Builds the reference solution for the doc (sample from the test dataset).
+ :param doc: dict[str: str] + sample from the test dataset + :return: str + """ + return "" + + def postprocess_generation(self, generation, idx): + # TODO: define the postprocessing for the LM generation + """ + Defines the postprocessing for a LM generation. + :param generation: str + code generation from LM + :param idx: int (if needed) + index of doc in the dataset to which the generation belongs + :return: str + """ + return "" + + def process_results(self, generations, references): + # TODO: define how the evaluation score is computed from list of \ + # generations and reference solutions + """ + Takes the list of LM generations and evaluates them against ground truth references, + returning the metric for the generations as in {"metric_name": result}. + We encourage to directly load the metric from `evaluate` library to keep the code concise. + :param generations: list(list(str)) + list of lists containing generations + :param references: list(str) + list of str containing refrences + :return: dict[str: float] + """ + return {} From d23b9385cacd128d00da2e97eead7d7c2ce8bddf Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:13:08 +0000 Subject: [PATCH 02/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 357d8eabc..cd9cb6d67 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -26,11 +26,31 @@ class ENAMEL(GeneralHumanEval): """ DATASET_PATH = "q-rz/enamel" - DATASET_NAME = None + DATASET_NAME = "ENAMEL_HumanEval" + DATASET_SUBSETS = { + "enamel": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), + "enamel-algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], + "enamel-impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], + "humaneval": list(range(164)), + } - def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout_factor=): # TODO - super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) - # TODO + def __init__(self, + strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20., + subset="enamel", # list of problem IDs, or one of {"enamel", "enamel-algo", "enamel-impl", "humaneval"} + hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + ): + super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=timeout) + if isinstance(subset, list): + self.subset = subset + else: + assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}" + self.subset = self.DATASET_SUBSETS[subset] + self.hardness = hardness + self.memory_giga = memory_giga + self.timeout_factor = timeout_factor + self.tolerence_sec = tolerence_sec + self.tests_path = tests_path + # TODO: load dataset and tests def get_dataset(self): # TODO: retrieve the evaluation subset from the loaded dataset (e.g. 
`self.dataset["test"]`) From 72e50d37251fff70a7522d36ddae69a74f0bb4be Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:16:41 +0000 Subject: [PATCH 03/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index cd9cb6d67..94521a35b 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -1,8 +1,8 @@ """How efficient is LLM-generated code? A rigorous & high-standard benchmark https://arxiv.org/pdf/2406.06647 -ENAMEL is a rigorous & high-standard benchmark for evaluating the efficiency of generated code -compared with **expert-written** reference solutions under 142 HumanEval problems +ENAMEL is a rigorous & high-standard benchmark for evaluating the efficiency of generated +Python code compared with expert-written reference solutions under 142 HumanEval problems Homepage: https://github.com/q-rz/enamel """ From 3d4c2752dca4d044a666fb8eaa83da9a477bed4f Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 05:03:53 +0000 Subject: [PATCH 04/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/__init__.py | 3 +- bigcode_eval/tasks/enamel.py | 54 ++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/bigcode_eval/tasks/__init__.py b/bigcode_eval/tasks/__init__.py index 8162a5f1a..e94f4099b 100644 --- a/bigcode_eval/tasks/__init__.py +++ b/bigcode_eval/tasks/__init__.py @@ -5,7 +5,7 @@ concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack, instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus, multiple, parity, python_bugs, quixbugs, recode, santacoder_fim, - studenteval, mercury) + studenteval, mercury, enamel) TASK_REGISTRY = { **apps.create_all_tasks(), @@ -31,6 +31,7 @@ **santacoder_fim.create_all_tasks(), "studenteval": studenteval.StudentEval, "mercury": mercury.Mercury, + **enamel.create_all_tasks(), } ALL_TASKS = sorted(list(TASK_REGISTRY)) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 94521a35b..7952ee4fe 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -8,7 +8,7 @@ """ from warnings import warn -from bigcode_eval.humaneval import GeneralHumanEval +from bigcode_eval.tasks.humaneval import GeneralHumanEval _CITATION = """ @article{qiu2024enamel, @@ -20,7 +20,7 @@ """ -class ENAMEL(GeneralHumanEval): +class GeneralENAMEL(GeneralHumanEval): """A task represents an entire benchmark including its dataset, problems, answers, generation settings and evaluation methods. 
""" @@ -28,18 +28,17 @@ class ENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "ENAMEL_HumanEval" DATASET_SUBSETS = { - "enamel": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), - "enamel-algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], - "enamel-impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], - "humaneval": list(range(164)), + "ENAMEL": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), + "ENAMEL_Algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], + "ENAMEL_Impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], } def __init__(self, - strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20., - subset="enamel", # list of problem IDs, or one of {"enamel", "enamel-algo", "enamel-impl", "humaneval"} + subset, # list of problem IDs, or one of the predefined subsets hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + strip_prompt=True, k=[1, 10, 100], num_workers=16, ): - super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=timeout) + super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) # each problem has a different time limit if isinstance(subset, list): self.subset = subset else: @@ -53,24 +52,8 @@ def __init__(self, # TODO: load dataset and tests def get_dataset(self): - # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`) """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return [] - - def fewshot_examples(self): - # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist - """Loads and returns the few-shot examples for the task if they exist.""" - pass - - def get_prompt(self, doc): - # TODO: build the prompt for the language model from a sample `doc` from the dataset - """ - Builds the prompt for the LM to generate from. 
- :param doc: dict[str: str] - sample from the test dataset - :return: str - """ - return "" + return self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset @@ -108,3 +91,22 @@ def process_results(self, generations, references): :return: dict[str: float] """ return {} + + +def create_task(subset): + class ENAMEL(GeneralEnamel): + __name__ = subset + __qualname__ = subset + def __init__(self, *args, **kwargs): + super().__init__(subset = subset, *args, **kwargs) + return ENAMEL + +def create_all_tasks(): + """Creates a dictionary of tasks from a list of levels + :return: {task_name: task} + """ + return { + "enamel": create_task(subset = "ENAMEL"), + "enamel-algo": create_task(subset = "ENAMEL_Algo"), + "enamel-impl": create_task(subset = "ENAMEL_Impl"), + } From 7847bb073333537a996871a02b3c959f4e881606 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 05:07:50 +0000 Subject: [PATCH 05/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 1 + bigcode_eval/tasks/enamel.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 bigcode_eval/tasks/custom_metrics/enamel_eval.py diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py new file mode 100644 index 000000000..1d44af559 --- /dev/null +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -0,0 +1 @@ +# TODO: eff@k diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 7952ee4fe..1a612ea22 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -94,7 +94,7 @@ def process_results(self, generations, references): def create_task(subset): - class ENAMEL(GeneralEnamel): + class ENAMEL(GeneralENAMEL): __name__ = subset __qualname__ = subset def __init__(self, *args, **kwargs): From 48c2f1c40e60339a78f1788dd1cfe355a1d73c61 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 06:35:35 +0000 Subject: [PATCH 06/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 37 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 1a612ea22..f09d0a1ec 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -8,7 +8,9 @@ """ from warnings import warn +import numpy as np from bigcode_eval.tasks.humaneval import GeneralHumanEval +from bigcode_eval.custom_metrics.enamel_eval import _CITATION = """ @article{qiu2024enamel, @@ -27,14 +29,8 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "ENAMEL_HumanEval" - DATASET_SUBSETS = { - "ENAMEL": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), - "ENAMEL_Algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], - "ENAMEL_Impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], - } - def __init__(self, - subset, # list of problem IDs, or one of the predefined subsets + def __init__(self, subset, # list 
of problem IDs hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], num_workers=16, ): @@ -44,29 +40,29 @@ def __init__(self, else: assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}" self.subset = self.DATASET_SUBSETS[subset] + self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec self.tests_path = tests_path - # TODO: load dataset and tests + # TODO: load tests from tests_path def get_dataset(self): """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] + return self.dataset[self.__name__] def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset """ Builds the reference solution for the doc (sample from the test dataset). - :param doc: dict[str: str] + :param doc: dict{str: str} sample from the test dataset :return: str """ return "" def postprocess_generation(self, generation, idx): - # TODO: define the postprocessing for the LM generation """ Defines the postprocessing for a LM generation. :param generation: str @@ -75,7 +71,9 @@ def postprocess_generation(self, generation, idx): index of doc in the dataset to which the generation belongs :return: str """ - return "" + prompt = self.get_prompt(self.get_dataset()[idx]) + generation = self._stop_at_stop_token(generation, self.stop_words) + return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not def process_results(self, generations, references): # TODO: define how the evaluation score is computed from list of \ @@ -93,12 +91,13 @@ def process_results(self, generations, references): return {} -def create_task(subset): +def create_task(name, subset): class ENAMEL(GeneralENAMEL): - __name__ = subset - __qualname__ = subset + __name__ = name + __qualname__ = name + SUBSET = subset def __init__(self, *args, **kwargs): - super().__init__(subset = subset, *args, **kwargs) + super().__init__(subset=self.SUBSET, *args, **kwargs) return ENAMEL def create_all_tasks(): @@ -106,7 +105,7 @@ def create_all_tasks(): :return: {task_name: task} """ return { - "enamel": create_task(subset = "ENAMEL"), - "enamel-algo": create_task(subset = "ENAMEL_Algo"), - "enamel-impl": create_task(subset = "ENAMEL_Impl"), + "enamel": create_task(name="ENAMEL", subset=sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160})), + "enamel-algo": create_task(name="ENAMEL_Algo", subset=[10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154]), + "enamel-impl": create_task(name="ENAMEL_Impl", subset=[1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161]), } From 02c43e9a6540d836afce1e7abd18034d414bb349 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:39:12 +0000 Subject: [PATCH 07/28] Add a new benchmark ENAMEL --- .../tasks/custom_metrics/enamel_eval.py | 12 ++++++++ 
bigcode_eval/tasks/enamel.py | 30 +++++++++++-------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 1d44af559..ff7b04464 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -1 +1,13 @@ # TODO: eff@k + +def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): + # TODO + +def might_catch_timeout_signal(): + # TODO + +might_catch_timeout_signal.WARNING = """\ +We have detected that the generated code samples use `try ... except` within a loop, which might catch \ +our timeout signal and cause a dead loop. Since resolving this rare issue via `multiprocessing` would \ +significantly slow down the evaluation process for our large-scale inputs, we have decided not to resolve \ +this issue. If this issue does happen, please consider removing the corresponding code samples.""" diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index f09d0a1ec..489622472 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -7,11 +7,6 @@ Homepage: https://github.com/q-rz/enamel """ -from warnings import warn -import numpy as np -from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.custom_metrics.enamel_eval import - _CITATION = """ @article{qiu2024enamel, title={How efficient is {LLM}-generated code? A rigorous \& high-standard benchmark}, @@ -22,6 +17,12 @@ """ +from warnings import warn +import numpy as np +from bigcode_eval.tasks.humaneval import GeneralHumanEval +from bigcode_eval.custom_metrics.enamel_eval import evaluate_all, might_catch_timeout_signal + + class GeneralENAMEL(GeneralHumanEval): """A task represents an entire benchmark including its dataset, problems, answers, generation settings and evaluation methods. 
@@ -31,10 +32,10 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_NAME = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", - strip_prompt=True, k=[1, 10, 100], num_workers=16, + hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + strip_prompt=True, k=[1, 10, 100], ): - super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) # each problem has a different time limit + super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit if isinstance(subset, list): self.subset = subset else: @@ -42,6 +43,7 @@ def __init__(self, subset, # list of problem IDs self.subset = self.DATASET_SUBSETS[subset] self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness + self.n_reps = n_reps self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec @@ -60,7 +62,7 @@ def get_reference(self, doc): sample from the test dataset :return: str """ - return "" + return "" # TODO: include tests def postprocess_generation(self, generation, idx): """ @@ -73,22 +75,24 @@ def postprocess_generation(self, generation, idx): """ prompt = self.get_prompt(self.get_dataset()[idx]) generation = self._stop_at_stop_token(generation, self.stop_words) + if (not self.warned_dead_loop) and might_catch_timeout_signal(generation): + warn(might_catch_timeout_signal.WARNING) return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not def process_results(self, generations, references): - # TODO: define how the evaluation score is computed from list of \ - # generations and reference solutions """ Takes the list of LM generations and evaluates them against ground truth references, returning the metric for the generations as in {"metric_name": result}. - We encourage to directly load the metric from `evaluate` library to keep the code concise. 
         :param generations: list(list(str))
             list of lists containing generations
         :param references: list(str)
             list of str containing references
         :return: dict[str: float]
         """
-        return {}
+        return evaluate_all(
+            generations, references, k=self.k, hardness=self.hardness, n_reps=self.n_reps,
+            memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec,
+        )
 
 
 def create_task(name, subset):

From d1e10b97615d1e9dee805db159deb0e0d4b44155 Mon Sep 17 00:00:00 2001
From: q-rz <100142775+q-rz@users.noreply.github.com>
Date: Thu, 18 Jul 2024 06:38:20 +0000
Subject: [PATCH 08/28] Add a new benchmark ENAMEL

---
 .../tasks/custom_metrics/enamel_eval.py       | 21 +++++++++++++++++--
 bigcode_eval/tasks/enamel.py                  |  4 +++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py
index ff7b04464..bfc0e19b8 100644
--- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py
+++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py
@@ -1,10 +1,27 @@
 # TODO: eff@k
+
+class Unpickler(pickle.Unpickler):
+    CLS_DICT = {'': Test, '': Refs}
+    def find_class(self, module, name):
+        if module in self.CLS_DICT:
+            return self.CLS_DICT[module]
+        else:
+            return super().find_class(module, name)
+
+
 def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
     # TODO
 
-
-def might_catch_timeout_signal():
-    # TODO
+
+def might_catch_timeout_signal(generation, pattern_seq = (' while ', ' try:')):
+    i = 0
+    for pattern in pattern_seq:
+        i = generation.find(pattern, i)
+        if i == -1:
+            return False
+        i += len(pattern)
+    return True
 
 might_catch_timeout_signal.WARNING = """\
 We have detected that the generated code samples use `try ...
except` within a loop, which might catch \ diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 489622472..09ef082ba 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -17,10 +17,11 @@ """ +import pickle from warnings import warn import numpy as np from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.custom_metrics.enamel_eval import evaluate_all, might_catch_timeout_signal +from bigcode_eval.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): @@ -77,6 +78,7 @@ def postprocess_generation(self, generation, idx): generation = self._stop_at_stop_token(generation, self.stop_words) if (not self.warned_dead_loop) and might_catch_timeout_signal(generation): warn(might_catch_timeout_signal.WARNING) + self.warned_dead_loop = True return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not def process_results(self, generations, references): From 86f3902209e949835903af205dc5f98c4ed3a897 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:30:38 +0000 Subject: [PATCH 09/28] Add a new benchmark ENAMEL --- .../tasks/custom_metrics/enamel_eval.py | 325 +++++++++++++++++- 1 file changed, 321 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index bfc0e19b8..f65813582 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -1,18 +1,176 @@ -# TODO: eff@k +from copy import deepcopy +import gc +import pickle +import time +import os, os.path as osp +import sys +import resource +import platform +import contextlib + +import numpy as np + +def calc_exec_time(ts): # Hodges--Lehmann estimator + ts = np.array(ts) / 2. 
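+    # halving first makes the pairwise sums below equal to the Walsh averages (t_i + t_j) / 2; their median is the Hodges--Lehmann estimate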
+ ts = ts[None, :] + ts[:, None] + ts = ts[np.tril_indices_from(ts)] + return np.median(ts) + +def calc_eff(elapsed, ref, timeout): + return max(0., timeout - elapsed) / (timeout - ref) + +def calc_eff_at_k(e, k): # numerically stable implementation + n = len(e) + lbd = [k / n] + k_ = k - 1 + for r in range(n - 1, k_, -1): + lbd.append(lbd[-1] * (1 - k_ / r)) + lbd = np.flip(lbd) + e = np.sort(e)[k_ :] + return (lbd * e).sum() + +def calc_pass_at_k(n, c, k): # from the HumanEval paper + if n - c < k: return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + +class Test: # a test case + def __init__(self, input = None, answer = None, ref = None): + self.input = input + self.answer = answer + self.ref = ref # reference execution time + +class Refs: # references for efficiency evaluation + def __init__(self, tests, hardness): + neg_inf = float('-inf') + self.refs = [neg_inf] * len(hardness) + self.ref_max = neg_inf + self.lid = None + self.cid = None + # finds the longest reference execution time for calibration + for j, (size, tests_j) in enumerate(tests): + if hardness[j]: + for k, test in enumerate(tests_j): + if self.refs[j] < test.ref: + self.refs[j] = test.ref + if self.ref_max < test.ref: + self.ref_max = test.ref + self.lid = j + self.cid = k class Unpickler(pickle.Unpickler): - CLS_DICT = {'': Test, '': Refs} + CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs} def find_class(self, module, name): if module in self.CLS_DICT: return self.CLS_DICT[module] else: return super().find_class(module, name) +TPL_RUN = '''%s +%s +__t0 = time.time() +__output = %s(*__input) +__t1 = time.time() +''' # % (prompt, solution, entry_point) +TPL_TEST = '''%s + pass +%s +__accepted = __check(__input, __answer, __output) +''' # % (prompt, checker) + +def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): + timeout = timeout_factor * refs.ref_max + memory_bytes = memory_giga * (1024 ** 3) + effs = [] + elapsed_list = [] + for j, (size, tests_j) in enumerate(tests): + n_reps_j = n_reps[j] + level_elapsed = [] + level_break = False + for k, test in enumerate(tests_j): + elapsed = [None for rep in range(n_reps_j)] + for rep in range(n_reps): + scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input + try: + unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) + scope['__input'] = test.input + scope['__answer'] = test.answer # to prevent the code reading the answer + unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input + except TimeoutException as e: + level_break = True + break + except MemoryError as e: + level_break = True + break + except OverflowError as e: + level_break = True + break + except KeyboardInterrupt as e: + raise e + except BaseException as e: + return False, self.zero_effs(), elapsed_list + else: + if '__accepted' in scope and scope['__accepted']: + elapsed[rep] = scope['__t1'] - scope['__t0'] + else: + return False, self.zero_effs(), elapsed_list + if level_break: + break + else: + level_elapsed.append(calc_exec_time(elapsed).item() * time_correction) + elapsed_list.append(level_elapsed) + if level_break: + break + else: + effs.append(calc_eff(elapsed = max(level_elapsed), ref = refs.refs[j], timeout = timeout)) + if j == 0 and level_break: + return False, self.zero_effs(), 
elapsed_list
+    for j in range(len(effs), self.n_levels):
+        effs.append(0.)
+    return True, effs, elapsed_list
 
-def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
-    # TODO
+def get_time_correction(problem, tests, refs, n_reps): # computes the calibration factor of execution time
+    j = refs.lid
+    k = refs.cid
+    test = tests[j][-1][k]
+    n_reps_j = n_reps[j]
+    elapsed = [None for rep in range(n_reps_j)]
+    for rep in range(n_reps_j):
+        scope = dict(time = time, __input = deepcopy(test.input)) # in case that the code modifies the input
+        unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free
+        elapsed[rep] = scope['__t1'] - scope['__t0']
+    elapsed = calc_exec_time(elapsed).item()
+    return refs.ref_max / elapsed
+
+def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
+    if isinstance(k, int):
+        k = [k]
+    min_codes = min(len(codes_i) for codes_i in codes)
+    k = sorted({k_ for k_ in k if k_ <= min_codes})
+    passes = [[] for k_ in k]
+    effs = [[] for k_ in k]
+    gc.collect()
+    for problem, codes_i, tests_i, refs_i in zip(problems, codes, tests, refs):
+        time_correction = get_time_correction(problem = problem, tests = tests_i, refs = refs_i, n_reps = n_reps)
+        n_levels = len(tests_i)
+        problem_passes = []
+        problem_effs = []
+        for code in codes_i:
+            passed, code_effs, code_elapsed = evaluate_one(
+                code = code, problem = problem, tests = tests_i, refs = refs_i,
+                k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga,
+                timeout_factor = timeout_factor, tolerence_sec = tolerence_sec, time_correction = time_correction)
+            problem_passes.append(passed)
+            problem_effs.append(code_effs)
+        for j, k_ in enumerate(k):
+            passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_))
+            effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_))
+    metrics = dict()
+    for k_, pass_k in zip(k, passes):
+        metrics[f'pass@{k_}'] = np.mean(pass_k).item()
+    for k_, eff_k in zip(k, effs):
+        metrics[f'eff@{k_}'] = np.mean(eff_k).item()
+    return metrics
 
 def might_catch_timeout_signal(generation, pattern_seq = (' while ', ' try:')):
     i = 0
     for pattern in pattern_seq:
         i = generation.find(pattern, i)
         if i == -1:
             return False
         i += len(pattern)
     return True
 
 might_catch_timeout_signal.WARNING = """\
 We have detected that the generated code samples use `try ... except` within a loop, which might catch \
 our timeout signal and cause a dead loop. Since resolving this rare issue via `multiprocessing` would \
 significantly slow down the evaluation process for our large-scale inputs, we have decided not to resolve \
 this issue.
If this issue does happen, please consider removing the corresponding code samples.""" + +"""The following functions are adapted from code_eval (@link https://huggingface.co/spaces/evaluate-metric/code_eval)""" + +def get_memory_usage(): + return sys.getsizeof(sys.modules[__name__]) + +@contextlib.contextmanager +def set_memory_limit(maximum_memory_bytes = None): + try: + if maximum_memory_bytes is not None: + _not_darwin = (not platform.uname().system == "Darwin") + _rlimit_as = resource.getrlimit(resource.RLIMIT_AS) + _rlimit_data = resource.getrlimit(resource.RLIMIT_DATA) + if _not_darwin: + _rlimit_stack = resource.getrlimit(resource.RLIMIT_STACK) + memory_limit = int(get_memory_usage() + maximum_memory_bytes) + resource.setrlimit(resource.RLIMIT_AS, (memory_limit, _rlimit_as[-1])) + resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, _rlimit_data[-1])) + if _not_darwin: + resource.setrlimit(resource.RLIMIT_STACK, (memory_limit, _rlimit_stack[-1])) + yield + finally: + if maximum_memory_bytes is not None: + resource.setrlimit(resource.RLIMIT_AS, _rlimit_as) + resource.setrlimit(resource.RLIMIT_DATA, _rlimit_data) + if _not_darwin: + resource.setrlimit(resource.RLIMIT_STACK, _rlimit_stack) + +class TimeoutException(Exception): + pass + +def timeout_signal_handler(signum, frame): + raise TimeoutException("Timed out!") + +@contextlib.contextmanager +def set_time_limit(seconds): + import signal + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, timeout_signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + +import io + +class WriteOnlyStringIO(io.StringIO): + def read(self, *args, **kwargs): + raise OSError + def readline(self, *args, **kwargs): + raise OSError + def readlines(self, *args, **kwargs): + raise OSError + def readable(self, *args, **kwargs): + return False + +class redirect_stdin(contextlib._RedirectStream): # type: ignore + _stream = "stdin" + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with redirect_stdin(stream): + yield + +@contextlib.contextmanager +def chdir(root): + if root == ".": + yield + return + cwd = os.getcwd() + os.chdir(root) + try: + yield + except BaseException as exc: + raise exc + finally: + os.chdir(cwd) + +@contextlib.contextmanager +def create_tempdir(): + import tempfile + with tempfile.TemporaryDirectory() as dirname: + with chdir(dirname): + yield dirname + +@contextlib.contextmanager +def reliability_guard(): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + with create_tempdir(): + with swallow_io(): + try: + + import faulthandler + + faulthandler.disable() + + import builtins, os, shutil, subprocess + + os.environ["OMP_NUM_THREADS"] = "1" + + _keys = dict( + builtins = ('exit', 'quit'), + os = ('kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir', 'fchdir', 'setuid', 'fork', 'forkpty', 'killpg', 'rename', 'renames', 'truncate', 'replace', 'unlink', 'fchmod', 'fchown', 'chmod', 'chown', 'chroot', 'lchflags', 'lchmod', 'lchown', 'getcwd', 'chdir'), + shutil = ('rmtree', 'move', 'chown'), + subprocess = ('Popen',), + ) + _baks = dict() + for lib, keys in _keys.items(): + obj = locals()[lib] + _bak = dict() + for key in keys: + if hasattr(obj, key): + _bak[key] = getattr(obj, key) + _baks[lib] = _bak + + #__builtins__["help"] = None + + yield + finally: + for lib, keys in _keys.items(): + obj = locals()[lib] + for key, val in _baks[lib].items(): + setattr(obj, key, val) + +def unsafe_execute(program: str, exec_globals: dict): + try: + gc_bak = gc.isenabled() + gc.disable() + with reliability_guard(): + exec(program, exec_globals) + finally: + if gc_bak: + gc.enable() + +def unsafe_timed_execute(program: str, exec_globals: dict, maximum_memory_bytes: float, time_limit_seconds: float): + try: + gc_bak = gc.isenabled() + gc.disable() + with reliability_guard(): + with set_memory_limit(maximum_memory_bytes): + with set_time_limit(time_limit_seconds): + exec(program, exec_globals) + finally: + if gc_bak: + gc.enable() From 4caa5dde9d69d6ff65088a260ee2a4c88cc1844b Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:37:30 +0000 Subject: [PATCH 10/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 2 +- bigcode_eval/tasks/enamel.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index f65813582..694e56d11 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -71,7 +71,7 @@ def find_class(self, module, name): __t0 = time.time() __output = %s(*__input) __t1 = time.time() -''' # % (prompt, solution, entry_point) +''' # % (prompt, code, entry_point) # this should work no matter code includes prompt or not TPL_TEST = '''%s pass %s diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 09ef082ba..e865f5b65 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -74,12 +74,11 @@ def postprocess_generation(self, generation, idx): index of doc in the dataset to which the generation belongs :return: str """ - prompt = self.get_prompt(self.get_dataset()[idx]) generation = self._stop_at_stop_token(generation, self.stop_words) if (not self.warned_dead_loop) and might_catch_timeout_signal(generation): warn(might_catch_timeout_signal.WARNING) self.warned_dead_loop = True - return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not + return generation def process_results(self, generations, references): """ From 027afcb58384287509cac5717b856ed77c2c7b19 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:55:08 +0000 Subject: [PATCH 11/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 5 ++--- bigcode_eval/tasks/enamel.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 694e56d11..dbd67cb9d 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -3,7 +3,8 @@ import pickle import time -import os, os.path as osp +import io +import os import sys import resource import platform @@ -230,8 +231,6 @@ def set_time_limit(seconds): finally: signal.setitimer(signal.ITIMER_REAL, 0) -import io - class WriteOnlyStringIO(io.StringIO): def read(self, *args, **kwargs): raise OSError diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index e865f5b65..f45f62106 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -44,7 +44,8 @@ def __init__(self, subset, # list of problem IDs self.subset = self.DATASET_SUBSETS[subset] self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness - self.n_reps = n_reps + self.n_levels = len(self.hardness) + self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec From eb4310325b418352843107469e897f75e9fcd14c Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 08:15:07 +0000 Subject: [PATCH 12/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index f45f62106..671461790 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -21,7 +21,7 @@ from warnings import warn import numpy as np from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): @@ -30,18 +30,15 @@ class GeneralENAMEL(GeneralHumanEval): """ DATASET_PATH = "q-rz/enamel" - DATASET_NAME = "ENAMEL_HumanEval" + DATASET_NAME = "default" def __init__(self, subset, # list of problem IDs hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit - if isinstance(subset, list): - self.subset = subset - else: - assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}" - self.subset = self.DATASET_SUBSETS[subset] + self.subset = subset + return # @TODO self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness self.n_levels = len(self.hardness) From f3e86ac72247ba322c00ae87be0a14452540f14e Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:26:40 +0000 Subject: [PATCH 13/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 671461790..4362d7574 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -20,6 +20,7 @@ import 
pickle from warnings import warn import numpy as np +from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval from bigcode_eval.tasks.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal @@ -31,6 +32,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "default" + DATASET_FULL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", @@ -38,20 +40,22 @@ def __init__(self, subset, # list of problem IDs ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit self.subset = subset - return # @TODO - self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO + self.dataset_full = self.dataset[self.DATASET_FULL].to_pandas() + self.dataset = self.dataset.iloc[np.array(self.subset), :] self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec - self.tests_path = tests_path + self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") # TODO: load tests from tests_path def get_dataset(self): """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return self.dataset[self.__name__] + return self.dataset + + #TODO get_prompt def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset From 5aafc0e472c873ede52d27a5ff21f9e4a6e1f228 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:49:03 +0000 Subject: [PATCH 14/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 4362d7574..6772ac619 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -41,7 +41,7 @@ def __init__(self, subset, # list of problem IDs super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit self.subset = subset self.dataset_full = self.dataset[self.DATASET_FULL].to_pandas() - self.dataset = self.dataset.iloc[np.array(self.subset), :] + self.dataset = self.dataset_full.iloc[np.array(self.subset), :] self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score From afb847184e4bb6215e58cf6f1d89b87ab32e0092 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:12:29 +0000 Subject: [PATCH 15/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 2 +- bigcode_eval/tasks/enamel.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index dbd67cb9d..ea8e550c2 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -59,7 +59,7 
@@ def __init__(self, tests, hardness): self.lid = j self.cid = k -class Unpickler(pickle.Unpickler): +class EnamUnpickler(pickle.Unpickler): CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs} def find_class(self, module, name): if module in self.CLS_DICT: diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 6772ac619..91e826629 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -17,12 +17,12 @@ """ -import pickle from warnings import warn +import pickle import numpy as np from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.tasks.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): @@ -48,8 +48,12 @@ def __init__(self, subset, # list of problem IDs self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec + if self.DATASET_PATH != 'q-rz/enamel': + warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") - # TODO: load tests from tests_path + with open(self.tests_path, 'rb') as fi: + self.tests_full = EnamUnpickler(fi).load() + self.tests = [self.tests_full[i] for i in self.subset] def get_dataset(self): """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" From 8cf92a6cdce84dc4c3082c9add0467ca18e5738f Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:15:21 +0000 Subject: [PATCH 16/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index ea8e550c2..9b786c0a8 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -62,8 +62,9 @@ def __init__(self, tests, hardness): class EnamUnpickler(pickle.Unpickler): CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs} def find_class(self, module, name): - if module in self.CLS_DICT: - return self.CLS_DICT[module] + cls_name = f'{module}.{name}' + if cls_name in self.CLS_DICT: + return self.CLS_DICT[cls_name] else: return super().find_class(module, name) From 71c69f6479420ef57809129e2f7412790eb34250 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 22:22:43 +0000 Subject: [PATCH 17/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 49 +++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 91e826629..8e40afc8f 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -22,7 +22,7 @@ import numpy as np from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, Dict, evaluate_all, might_catch_timeout_signal class 
GeneralENAMEL(GeneralHumanEval): @@ -32,44 +32,51 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "default" - DATASET_FULL = "ENAMEL_HumanEval" + DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit - self.subset = subset - self.dataset_full = self.dataset[self.DATASET_FULL].to_pandas() - self.dataset = self.dataset_full.iloc[np.array(self.subset), :] + self.subset = subset if isinstance(subset, list) else list(subset) + self.n_probs = len(self.subset) + self.dataset = self.dataset[self.DATASET_ALL].to_pandas().iloc[self.subset, :] self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec + #warn(f"Problems here have been renumbered 0--{self.n_probs - 1} to compatibilize with `bigcode_eval`") if self.DATASET_PATH != 'q-rz/enamel': warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") with open(self.tests_path, 'rb') as fi: - self.tests_full = EnamUnpickler(fi).load() - self.tests = [self.tests_full[i] for i in self.subset] + tests_all, refs_all = EnamUnpickler(fi).load() + self.tests = [tests_all[i] for i in self.subset] + self.refs = [refs_all[i] for i in self.subset] def get_dataset(self): - """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return self.dataset + """Returns dataset as an iterable of namedtuple""" + return self.dataset.itertuples(index=True) - #TODO get_prompt + def get_prompt(self, doc): + """ + :param doc: namedtuple + a row from the dataset + :return: str + """ + return super().get_prompt(doc._asdict()) def get_reference(self, doc): - # TODO: get the reference solution from a sample `doc` from the dataset """ - Builds the reference solution for the doc (sample from the test dataset). 
- :param doc: dict{str: str} - sample from the test dataset - :return: str + :param doc: namedtuple + a row from the dataset + :return: tuple (problem, tests, refs) """ - return "" # TODO: include tests + i = doc.Index + return (doc, self.tests[i], self.refs[i]) def postprocess_generation(self, generation, idx): """ @@ -96,8 +103,16 @@ def process_results(self, generations, references): list of str containing refrences :return: dict[str: float] """ + problems = [] + tests = [] + refs = [] + for problem, tests_i, refs_i in references: + problems.append(problem) + tests.append(tests_i) + refs.append(refs_i) return evaluate_all( - generations, references, k=self.k, hardness=self.hardness, n_reps=self.n_reps, + problems=problems, codes=generations, tests=tests, refs=refs, + k=self.k, hardness=self.hardness, n_reps=self.n_reps, memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec, ) From 32265c8e230dbc9a63b702154070370e3e92fabf Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 22:32:13 +0000 Subject: [PATCH 18/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 8e40afc8f..762faff75 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -22,7 +22,7 @@ import numpy as np from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, Dict, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): From bef756648193448eb4eb25468e2f1a0462981de4 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 23:49:27 +0000 Subject: [PATCH 19/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 762faff75..34207e70c 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -59,7 +59,7 @@ def __init__(self, subset, # list of problem IDs def get_dataset(self): """Returns dataset as an iterable of namedtuple""" - return self.dataset.itertuples(index=True) + return list(self.dataset.itertuples(index=True)) def get_prompt(self, doc): """ From bf3348f0654a776482e7c1ae66196c6fdf8b5fd9 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 23:53:19 +0000 Subject: [PATCH 20/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 9b786c0a8..9bbf57293 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -91,7 +91,7 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t level_break = False for k, test in enumerate(tests_j): elapsed = [None for rep in range(n_reps_j)] - for rep in range(n_reps): + for rep in range(n_reps_j): scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input try: 
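                     # run the candidate on this test case under the memory cap and the per-problem time limit, then check its output with the problem's checker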
unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) From 93c47cc4990d8edb861f09f8d520513d644fd573 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 00:25:33 +0000 Subject: [PATCH 21/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 16 ++++++++-------- bigcode_eval/tasks/enamel.py | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 9bbf57293..50f9943bd 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -83,8 +83,9 @@ def find_class(self, module, name): def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): timeout = timeout_factor * refs.ref_max memory_bytes = memory_giga * (1024 ** 3) + n_levels = len(tests) + zero_effs = [0. for j in range(n_levels)] effs = [] - elapsed_list = [] for j, (size, tests_j) in enumerate(tests): n_reps_j = n_reps[j] level_elapsed = [] @@ -110,26 +111,25 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t except KeyboardInterrupt as e: raise e except BaseException as e: - return False, self.zero_effs(), elapsed_list + return False, zero_effs else: if '__accepted' in scope and scope['__accepted']: elapsed[rep] = scope['__t1'] - scope['__t0'] else: - return False, self.zero_effs(), elapsed_list + return False, zero_effs if level_break: break else: level_elapsed.append(calc_exec_time(elapsed).item() * time_correction) - elapsed_list.append(level_elapsed) if level_break: break else: effs.append(calc_eff(elapsed = max(level_elapsed), ref = refs.refs[j], timeout = timeout)) if j == 0 and level_break: - return False, self.zero_effs(), elapsed_list - for j in range(len(effs), self.n_levels): + return False, zero_effs + for j in range(len(effs), n_levels): effs.append(0.) 
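     # unreached (harder) levels have been padded with zero efficiency; the sample still counts as passed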
- return True, effs, elapsed_list + return True, effs def get_time_correction(problem, tests, refs, n_reps): # computes the calibration factor of of execution time j = refs.lid @@ -158,7 +158,7 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, problem_passes = [] problem_effs = [] for code in codes_i: - passed, code_effs, code_elapsed = evaluate_one( + passed, code_effs = evaluate_one( code = code, problem = problem, tests = tests_i, refs = refs_i, k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga, timeout_factor = timeout_factor, tolerence_sec = tolerence_sec, time_correction = time_correction) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 34207e70c..6d2c4283f 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -42,6 +42,7 @@ def __init__(self, subset, # list of problem IDs self.subset = subset if isinstance(subset, list) else list(subset) self.n_probs = len(self.subset) self.dataset = self.dataset[self.DATASET_ALL].to_pandas().iloc[self.subset, :] + self.prob_ids = {row.task_id: i for i, row in enumerate(self.dataset.itertuples(index=False))} self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score @@ -59,7 +60,7 @@ def __init__(self, subset, # list of problem IDs def get_dataset(self): """Returns dataset as an iterable of namedtuple""" - return list(self.dataset.itertuples(index=True)) + return list(self.dataset.itertuples(index=False)) def get_prompt(self, doc): """ @@ -75,7 +76,7 @@ def get_reference(self, doc): a row from the dataset :return: tuple (problem, tests, refs) """ - i = doc.Index + i = self.prob_ids[doc.task_id] return (doc, self.tests[i], self.refs[i]) def postprocess_generation(self, generation, idx): From 6b6163da133b2103263f672491aa5935252a352e Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 00:36:02 +0000 Subject: [PATCH 22/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 50f9943bd..f5a9c0764 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -167,6 +167,7 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, for j, k_ in enumerate(k): passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_)) effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_)) + if effs[j][-1] < 0.98: print(f'{problem.task_id}: eff={effs[j][-1]:.4f}', flush = True) metrics = dict() for k_, pass_k in zip(k, passes): metrics[f'pass@{k_}'] = np.mean(pass_k).item() From fd7694fbc6430ede2a56f78e9a4d6466c4865dfe Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 03:47:11 +0000 Subject: [PATCH 23/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index f5a9c0764..57f7b943e 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ 
b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -81,7 +81,7 @@ def find_class(self, module, name): ''' # % (prompt, checker) def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): - timeout = timeout_factor * refs.ref_max + timeout = timeout_factor * refs.ref_max / time_correction memory_bytes = memory_giga * (1024 ** 3) n_levels = len(tests) zero_effs = [0. for j in range(n_levels)] @@ -100,9 +100,11 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t scope['__answer'] = test.answer # to prevent the code reading the answer unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input except TimeoutException as e: + print(f'TLE: {problem.task_id} level={j} case={k}')########## level_break = True break except MemoryError as e: + print(f'MLE: {problem.task_id} level={j} case={k}')########## level_break = True break except OverflowError as e: @@ -167,7 +169,7 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, for j, k_ in enumerate(k): passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_)) effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_)) - if effs[j][-1] < 0.98: print(f'{problem.task_id}: eff={effs[j][-1]:.4f}', flush = True) + if abs(effs[j][-1] - 1.) > 0.03: print(f'{problem.task_id}: eff={effs[j][-1]:.4f} c={time_correction:.4f}', flush = True)############## metrics = dict() for k_, pass_k in zip(k, passes): metrics[f'pass@{k_}'] = np.mean(pass_k).item() From cd0810c06d9c8aa21051d4294624b58f872849ac Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 04:57:22 +0000 Subject: [PATCH 24/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 11 +++++++---- bigcode_eval/tasks/enamel.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 57f7b943e..d51b01921 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -81,7 +81,8 @@ def find_class(self, module, name): ''' # % (prompt, checker) def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): - timeout = timeout_factor * refs.ref_max / time_correction + timeout = timeout_factor * refs.ref_max + time_limit = timeout / min(time_correction, 1.) memory_bytes = memory_giga * (1024 ** 3) n_levels = len(tests) zero_effs = [0. 
for j in range(n_levels)] @@ -95,16 +96,16 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t for rep in range(n_reps_j): scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input try: - unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) + unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, time_limit + tolerence_sec) scope['__input'] = test.input scope['__answer'] = test.answer # to prevent the code reading the answer unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input except TimeoutException as e: - print(f'TLE: {problem.task_id} level={j} case={k}')########## + print(f'TLE {problem.task_id} level={j} case={k}')########## level_break = True break except MemoryError as e: - print(f'MLE: {problem.task_id} level={j} case={k}')########## + print(f'MLE {problem.task_id} level={j} case={k}')########## level_break = True break except OverflowError as e: @@ -113,11 +114,13 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t except KeyboardInterrupt as e: raise e except BaseException as e: + print(f'RE {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs else: if '__accepted' in scope and scope['__accepted']: elapsed[rep] = scope['__t1'] - scope['__t0'] else: + print(f'WA {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs if level_break: break diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 6d2c4283f..820dd5cb9 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -49,7 +49,6 @@ def __init__(self, subset, # list of problem IDs self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec - #warn(f"Problems here have been renumbered 0--{self.n_probs - 1} to compatibilize with `bigcode_eval`") if self.DATASET_PATH != 'q-rz/enamel': warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. 
Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") @@ -85,7 +84,7 @@ def postprocess_generation(self, generation, idx): :param generation: str code generation from LM :param idx: int (if needed) - index of doc in the dataset to which the generation belongs + index of doc in the dataset to which the generation belongs; not needed here :return: str """ generation = self._stop_at_stop_token(generation, self.stop_words) From de62d2048139d5a19eec5e5d0951f0d929cefae9 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 05:04:19 +0000 Subject: [PATCH 25/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 820dd5cb9..5250fa272 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -35,7 +35,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=8., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit From 80f4e147dcd2ad13876ef5870b5c59abdabcf0b7 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 05:17:46 +0000 Subject: [PATCH 26/28] Add a new benchmark ENAMEL --- .../tasks/custom_metrics/enamel_eval.py | 47 +++++++++---------- bigcode_eval/tasks/enamel.py | 15 +++--- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index d51b01921..7cf1b82e8 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -80,9 +80,8 @@ def find_class(self, module, name): __accepted = __check(__input, __answer, __output) ''' # % (prompt, checker) -def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): +def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): timeout = timeout_factor * refs.ref_max - time_limit = timeout / min(time_correction, 1.) memory_bytes = memory_giga * (1024 ** 3) n_levels = len(tests) zero_effs = [0. 
for j in range(n_levels)] @@ -96,16 +95,14 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t for rep in range(n_reps_j): scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input try: - unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, time_limit + tolerence_sec) + unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) scope['__input'] = test.input scope['__answer'] = test.answer # to prevent the code reading the answer unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input except TimeoutException as e: - print(f'TLE {problem.task_id} level={j} case={k}')########## level_break = True break except MemoryError as e: - print(f'MLE {problem.task_id} level={j} case={k}')########## level_break = True break except OverflowError as e: @@ -114,18 +111,16 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t except KeyboardInterrupt as e: raise e except BaseException as e: - print(f'RE {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs else: if '__accepted' in scope and scope['__accepted']: elapsed[rep] = scope['__t1'] - scope['__t0'] else: - print(f'WA {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs if level_break: break else: - level_elapsed.append(calc_exec_time(elapsed).item() * time_correction) + level_elapsed.append(calc_exec_time(elapsed).item()) if level_break: break else: @@ -136,20 +131,21 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t effs.append(0.) 
return True, effs -def get_time_correction(problem, tests, refs, n_reps): # computes the calibration factor of of execution time - j = refs.lid - k = refs.cid - test = tests[j][-1][k] - n_reps_j = n_reps[j] - elapsed = [None for rep in range(n_reps_j)] - for rep in range(n_reps_j): - scope = dict(time = time, __input = deepcopy(test.input)) # in case that the code modifies the input - unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free - elapsed[rep] = scope['__t1'] - scope['__t0'] - elapsed = calc_exec_time(elapsed).item() - return refs.ref_max / elapsed - -def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): +def compute_refs(problem, tests, n_reps, hardness): # computes the calibration factor of of execution time + for j in range(len(tests)): + if hardness[j]: + for k in range(len(tests[j][-1])): + test = tests[j][-1][k] + n_reps_j = n_reps[j] + elapsed = [None for rep in range(n_reps_j)] + for rep in range(n_reps_j): + scope = dict(time = time, __input = deepcopy(test.input)) # in case that the code modifies the input + unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free + elapsed[rep] = scope['__t1'] - scope['__t0'] + test.ref = calc_exec_time(elapsed).item() + return Refs(tests = tests, hardness = hardness) + +def evaluate_all(problems, codes, tests, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): if isinstance(k, int): k = [k] min_codes = min(len(codes_i) for codes_i in codes) @@ -157,8 +153,8 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, passes = [[] for k_ in k] effs = [[] for k_ in k] gc.collect() - for problem, codes_i, tests_i, refs_i in zip(problems, codes, tests, refs): - time_correction = get_time_correction(problem = problem, tests = tests_i, refs = refs_i, n_reps = n_reps) + for problem, codes_i, tests_i in zip(problems, codes, tests): + refs_i = compute_refs(problem = problem, tests = tests_i, n_reps = n_reps, hardness = hardness) n_levels = len(tests_i) problem_passes = [] problem_effs = [] @@ -166,13 +162,12 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, passed, code_effs = evaluate_one( code = code, problem = problem, tests = tests_i, refs = refs_i, k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga, - timeout_factor = timeout_factor, tolerence_sec = tolerence_sec, time_correction = time_correction) + timeout_factor = timeout_factor, tolerence_sec = tolerence_sec) problem_passes.append(passed) problem_effs.append(code_effs) for j, k_ in enumerate(k): passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_)) effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_)) - if abs(effs[j][-1] - 1.) 
> 0.03: print(f'{problem.task_id}: eff={effs[j][-1]:.4f} c={time_correction:.4f}', flush = True)############## metrics = dict() for k_, pass_k in zip(k, passes): metrics[f'pass@{k_}'] = np.mean(pass_k).item() diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 5250fa272..0ebcce402 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -35,7 +35,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=8., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + hardness=[0., 3., 3., 4.], n_reps = 10, memory_giga=10., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit @@ -53,9 +53,8 @@ def __init__(self, subset, # list of problem IDs warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") with open(self.tests_path, 'rb') as fi: - tests_all, refs_all = EnamUnpickler(fi).load() + tests_all, _ = EnamUnpickler(fi).load() self.tests = [tests_all[i] for i in self.subset] - self.refs = [refs_all[i] for i in self.subset] def get_dataset(self): """Returns dataset as an iterable of namedtuple""" @@ -73,10 +72,10 @@ def get_reference(self, doc): """ :param doc: namedtuple a row from the dataset - :return: tuple (problem, tests, refs) + :return: tuple (problem, tests) """ i = self.prob_ids[doc.task_id] - return (doc, self.tests[i], self.refs[i]) + return doc, self.tests[i] def postprocess_generation(self, generation, idx): """ @@ -105,13 +104,11 @@ def process_results(self, generations, references): """ problems = [] tests = [] - refs = [] - for problem, tests_i, refs_i in references: + for problem, tests_i in references: problems.append(problem) tests.append(tests_i) - refs.append(refs_i) return evaluate_all( - problems=problems, codes=generations, tests=tests, refs=refs, + problems=problems, codes=generations, tests=tests, k=self.k, hardness=self.hardness, n_reps=self.n_reps, memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec, ) From 71094336479a59b1d127b46c553bf4ed5ff9eaed Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 05:22:22 +0000 Subject: [PATCH 27/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 0ebcce402..35c7be212 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -35,7 +35,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], n_reps = 10, memory_giga=10., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=10., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit From cca02b2431bee950cd5248272394a58d3d264325 Mon Sep 17 
00:00:00 2001
From: Ruizhong Qiu <100142775+q-rz@users.noreply.github.com>
Date: Sun, 21 Jul 2024 23:13:18 -0700
Subject: [PATCH 28/28] Update README.md

---
 docs/README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/README.md b/docs/README.md
index 903c6a122..e652061bf 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -426,6 +426,27 @@ accelerate launch main.py \
   --metric_output_path .json
 ```
 
+## ENAMEL
+
+[ENAMEL](https://github.com/q-rz/enamel) is a rigorous & high-standard benchmark for evaluating the efficiency of generated Python code under large-scale inputs. It supports a new efficiency metric called eff@k, which generalizes the pass@k metric. In addition, it provides expert-written reference solutions and expert-written test case generators, thus setting a high standard for efficiency evaluation. See [this paper](https://arxiv.org/abs/2406.06647) for details.
+
+**Notice:** It is NOT recommended to use multiple threads or processes during efficiency evaluation, because resource contention can distort the measured execution times and thus the efficiency results.
+
+```python
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation 2048 \
+  --tasks enamel \
+  --temperature 0.8 \
+  --top_p 0.95 \
+  --do_sample True \
+  --n_samples 10 \
+  --batch_size 10 \
+  --allow_code_execution
+```
+
+This implementation also supports the two subsets Algo and Impl from the paper: `--tasks enamel-algo` / `--tasks enamel-impl`.
+
 ## Code generation benchmarks without unit tests
 For these tasks, we do single generations and compare the generated code against reference solutions and compute BLEU score. For the following tasks, we use a two-shot setting where we include 2 inputs and their solutions in the prompt, all preceded by an instruction such as: ` "Answer the following instructions in a one line SQL query:\n"`. The solutions consist of one line so we stop the generation when a new line is generated. 3 languages are present: Python, SQL and Java.
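
For readers of the docs section added above, here is a minimal sketch of how the eff@k metric can be estimated. It is illustrative only and is not the `calc_eff_at_k` implementation shipped in `bigcode_eval/tasks/custom_metrics/enamel_eval.py`: following the ENAMEL paper, each generated sample receives an efficiency score (hardness-weighted across input levels, and 0 if it fails or times out), and eff@k is estimated as the expected maximum score over a uniformly random subset of k of the n samples, computed from the sorted scores; pass@k is recovered when the scores are 0/1. The helper name `eff_at_k` and the example scores below are hypothetical.

```python
from math import comb

import numpy as np


def eff_at_k(effs, k):
    """Unbiased estimator of eff@k from n per-sample efficiency scores.

    eff@k = E[ max_{i in S} e_i ] over uniformly random subsets S of size k.
    With scores sorted ascending as e_(1) <= ... <= e_(n), the i-th smallest
    score is the subset maximum with probability C(i-1, k-1) / C(n, k).
    """
    e = np.sort(np.asarray(effs, dtype=float))  # ascending order
    n = len(e)
    if not 1 <= k <= n:
        raise ValueError("k must satisfy 1 <= k <= n")
    # C(i-1, k-1) is 0 for i < k, so only the top n-k+1 scores contribute
    weights = [comb(i - 1, k - 1) / comb(n, k) for i in range(1, n + 1)]
    return float(np.dot(weights, e))


# Example with n_samples = 10 for one problem: three samples fail (score 0),
# the rest run with varying efficiency relative to the expert reference.
scores = [0.0, 0.0, 0.0, 0.42, 0.55, 0.61, 0.70, 0.74, 0.81, 0.93]
print(eff_at_k(scores, k=1))   # mean efficiency of a single sample (0.476)
print(eff_at_k(scores, k=10))  # best efficiency among all samples (0.93)
```

As with pass@k, eff@k for a given k is only meaningful when the number of generations per problem is at least k; the example command above uses `--n_samples 10`, which supports reporting eff@1 and eff@10.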