From 4eadff52bc23db79e75c622711c000c05c9b7f19 Mon Sep 17 00:00:00 2001
From: q-rz <100142775+q-rz@users.noreply.github.com>
Date: Sun, 30 Jun 2024 23:19:08 +0000
Subject: [PATCH 01/28] Add a new benchmark ENAMEL

---
 README.md                    |  1 +
 bigcode_eval/tasks/enamel.py | 90 ++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 bigcode_eval/tasks/enamel.py

diff --git a/README.md b/README.md
index aa3bb89e3..20941d607 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ Below are the features and tasks of this framework:
     - `StarCoderFIM`: which uses the default FIM tokens `"<fim_prefix>", "<fim_middle>", "<fim_suffix>"`, and
     - `SantaCoderFIM`: which uses SantaCoder FIM tokens `"<fim-prefix>", "<fim-middle>", "<fim-suffix>"`
 - [Mercury](https://huggingface.co/datasets/Elfsong/Mercury) for evaluating computational efficiency of **Python** code generation.
+  - [ENAMEL](https://github.com/q-rz/enamel) for evaluating the efficiency ($\textnormal{eff@}k$) of generated **Python** code against **expert-written** reference solutions on HumanEval problems.
 
 More details about each task can be found in the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).
 
 ## Setup
diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py
new file mode 100644
index 000000000..357d8eabc
--- /dev/null
+++ b/bigcode_eval/tasks/enamel.py
@@ -0,0 +1,90 @@
+"""How efficient is LLM-generated code? A rigorous & high-standard benchmark
+https://arxiv.org/pdf/2406.06647
+
+ENAMEL is a rigorous & high-standard benchmark for evaluating the efficiency of generated code
+compared with **expert-written** reference solutions under 142 HumanEval problems
+
+Homepage: https://github.com/q-rz/enamel
+"""
+
+from warnings import warn
+from bigcode_eval.humaneval import GeneralHumanEval
+
+_CITATION = """
+@article{qiu2024enamel,
+  title={How efficient is {LLM}-generated code? A rigorous \& high-standard benchmark},
+  author={Qiu, Ruizhong and Zeng, Weiliang Will and Tong, Hanghang and Ezick, James and Lott, Christopher},
+  journal={arXiv preprint arXiv:2406.06647},
+  year={2024}
+}
+"""
+
+
+class ENAMEL(GeneralHumanEval):
+    """A task represents an entire benchmark including its dataset, problems,
+    answers, generation settings and evaluation methods.
+    """
+
+    DATASET_PATH = "q-rz/enamel"
+    DATASET_NAME = None
+
+    def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout_factor=): # TODO
+        super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None)
+        # TODO
+
+    def get_dataset(self):
+        # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`)
+        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
+        return []
+
+    def fewshot_examples(self):
+        # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist
+        """Loads and returns the few-shot examples for the task if they exist."""
+        pass
+
+    def get_prompt(self, doc):
+        # TODO: build the prompt for the language model from a sample `doc` from the dataset
+        """
+        Builds the prompt for the LM to generate from.
+        :param doc: dict[str: str]
+            sample from the test dataset
+        :return: str
+        """
+        return ""
+
+    def get_reference(self, doc):
+        # TODO: get the reference solution from a sample `doc` from the dataset
+        """
+        Builds the reference solution for the doc (sample from the test dataset).
+ :param doc: dict[str: str] + sample from the test dataset + :return: str + """ + return "" + + def postprocess_generation(self, generation, idx): + # TODO: define the postprocessing for the LM generation + """ + Defines the postprocessing for a LM generation. + :param generation: str + code generation from LM + :param idx: int (if needed) + index of doc in the dataset to which the generation belongs + :return: str + """ + return "" + + def process_results(self, generations, references): + # TODO: define how the evaluation score is computed from list of \ + # generations and reference solutions + """ + Takes the list of LM generations and evaluates them against ground truth references, + returning the metric for the generations as in {"metric_name": result}. + We encourage to directly load the metric from `evaluate` library to keep the code concise. + :param generations: list(list(str)) + list of lists containing generations + :param references: list(str) + list of str containing refrences + :return: dict[str: float] + """ + return {} From d23b9385cacd128d00da2e97eead7d7c2ce8bddf Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:13:08 +0000 Subject: [PATCH 02/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 357d8eabc..cd9cb6d67 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -26,11 +26,31 @@ class ENAMEL(GeneralHumanEval): """ DATASET_PATH = "q-rz/enamel" - DATASET_NAME = None + DATASET_NAME = "ENAMEL_HumanEval" + DATASET_SUBSETS = { + "enamel": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), + "enamel-algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], + "enamel-impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], + "humaneval": list(range(164)), + } - def __init__(self, strip_prompt, k=[1, 10, 100], num_workers=16, timeout_factor=): # TODO - super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) - # TODO + def __init__(self, + strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20., + subset="enamel", # list of problem IDs, or one of {"enamel", "enamel-algo", "enamel-impl", "humaneval"} + hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + ): + super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=timeout) + if isinstance(subset, list): + self.subset = subset + else: + assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}" + self.subset = self.DATASET_SUBSETS[subset] + self.hardness = hardness + self.memory_giga = memory_giga + self.timeout_factor = timeout_factor + self.tolerence_sec = tolerence_sec + self.tests_path = tests_path + # TODO: load dataset and tests def get_dataset(self): # TODO: retrieve the evaluation subset from the loaded dataset (e.g. 
`self.dataset["test"]`) From 72e50d37251fff70a7522d36ddae69a74f0bb4be Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 00:16:41 +0000 Subject: [PATCH 03/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index cd9cb6d67..94521a35b 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -1,8 +1,8 @@ """How efficient is LLM-generated code? A rigorous & high-standard benchmark https://arxiv.org/pdf/2406.06647 -ENAMEL is a rigorous & high-standard benchmark for evaluating the efficiency of generated code -compared with **expert-written** reference solutions under 142 HumanEval problems +ENAMEL is a rigorous & high-standard benchmark for evaluating the efficiency of generated +Python code compared with expert-written reference solutions under 142 HumanEval problems Homepage: https://github.com/q-rz/enamel """ From 3d4c2752dca4d044a666fb8eaa83da9a477bed4f Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 05:03:53 +0000 Subject: [PATCH 04/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/__init__.py | 3 +- bigcode_eval/tasks/enamel.py | 54 ++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/bigcode_eval/tasks/__init__.py b/bigcode_eval/tasks/__init__.py index 8162a5f1a..e94f4099b 100644 --- a/bigcode_eval/tasks/__init__.py +++ b/bigcode_eval/tasks/__init__.py @@ -5,7 +5,7 @@ concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack, instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus, multiple, parity, python_bugs, quixbugs, recode, santacoder_fim, - studenteval, mercury) + studenteval, mercury, enamel) TASK_REGISTRY = { **apps.create_all_tasks(), @@ -31,6 +31,7 @@ **santacoder_fim.create_all_tasks(), "studenteval": studenteval.StudentEval, "mercury": mercury.Mercury, + **enamel.create_all_tasks(), } ALL_TASKS = sorted(list(TASK_REGISTRY)) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 94521a35b..7952ee4fe 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -8,7 +8,7 @@ """ from warnings import warn -from bigcode_eval.humaneval import GeneralHumanEval +from bigcode_eval.tasks.humaneval import GeneralHumanEval _CITATION = """ @article{qiu2024enamel, @@ -20,7 +20,7 @@ """ -class ENAMEL(GeneralHumanEval): +class GeneralENAMEL(GeneralHumanEval): """A task represents an entire benchmark including its dataset, problems, answers, generation settings and evaluation methods. 
""" @@ -28,18 +28,17 @@ class ENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "ENAMEL_HumanEval" DATASET_SUBSETS = { - "enamel": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), - "enamel-algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], - "enamel-impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], - "humaneval": list(range(164)), + "ENAMEL": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), + "ENAMEL_Algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], + "ENAMEL_Impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], } def __init__(self, - strip_prompt, k=[1, 10, 100], num_workers=16, timeout=20., - subset="enamel", # list of problem IDs, or one of {"enamel", "enamel-algo", "enamel-impl", "humaneval"} + subset, # list of problem IDs, or one of the predefined subsets hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + strip_prompt=True, k=[1, 10, 100], num_workers=16, ): - super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=timeout) + super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) # each problem has a different time limit if isinstance(subset, list): self.subset = subset else: @@ -53,24 +52,8 @@ def __init__(self, # TODO: load dataset and tests def get_dataset(self): - # TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`) """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return [] - - def fewshot_examples(self): - # TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist - """Loads and returns the few-shot examples for the task if they exist.""" - pass - - def get_prompt(self, doc): - # TODO: build the prompt for the language model from a sample `doc` from the dataset - """ - Builds the prompt for the LM to generate from. 
- :param doc: dict[str: str] - sample from the test dataset - :return: str - """ - return "" + return self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset @@ -108,3 +91,22 @@ def process_results(self, generations, references): :return: dict[str: float] """ return {} + + +def create_task(subset): + class ENAMEL(GeneralEnamel): + __name__ = subset + __qualname__ = subset + def __init__(self, *args, **kwargs): + super().__init__(subset = subset, *args, **kwargs) + return ENAMEL + +def create_all_tasks(): + """Creates a dictionary of tasks from a list of levels + :return: {task_name: task} + """ + return { + "enamel": create_task(subset = "ENAMEL"), + "enamel-algo": create_task(subset = "ENAMEL_Algo"), + "enamel-impl": create_task(subset = "ENAMEL_Impl"), + } From 7847bb073333537a996871a02b3c959f4e881606 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 05:07:50 +0000 Subject: [PATCH 05/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 1 + bigcode_eval/tasks/enamel.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 bigcode_eval/tasks/custom_metrics/enamel_eval.py diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py new file mode 100644 index 000000000..1d44af559 --- /dev/null +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -0,0 +1 @@ +# TODO: eff@k diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 7952ee4fe..1a612ea22 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -94,7 +94,7 @@ def process_results(self, generations, references): def create_task(subset): - class ENAMEL(GeneralEnamel): + class ENAMEL(GeneralENAMEL): __name__ = subset __qualname__ = subset def __init__(self, *args, **kwargs): From 48c2f1c40e60339a78f1788dd1cfe355a1d73c61 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 06:35:35 +0000 Subject: [PATCH 06/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 37 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 1a612ea22..f09d0a1ec 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -8,7 +8,9 @@ """ from warnings import warn +import numpy as np from bigcode_eval.tasks.humaneval import GeneralHumanEval +from bigcode_eval.custom_metrics.enamel_eval import _CITATION = """ @article{qiu2024enamel, @@ -27,14 +29,8 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "ENAMEL_HumanEval" - DATASET_SUBSETS = { - "ENAMEL": sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160}), - "ENAMEL_Algo": [10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154], - "ENAMEL_Impl": [1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161], - } - def __init__(self, - subset, # list of problem IDs, or one of the predefined subsets + def __init__(self, subset, # list 
of problem IDs hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], num_workers=16, ): @@ -44,29 +40,29 @@ def __init__(self, else: assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}" self.subset = self.DATASET_SUBSETS[subset] + self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec self.tests_path = tests_path - # TODO: load dataset and tests + # TODO: load tests from tests_path def get_dataset(self): """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] + return self.dataset[self.__name__] def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset """ Builds the reference solution for the doc (sample from the test dataset). - :param doc: dict[str: str] + :param doc: dict{str: str} sample from the test dataset :return: str """ return "" def postprocess_generation(self, generation, idx): - # TODO: define the postprocessing for the LM generation """ Defines the postprocessing for a LM generation. :param generation: str @@ -75,7 +71,9 @@ def postprocess_generation(self, generation, idx): index of doc in the dataset to which the generation belongs :return: str """ - return "" + prompt = self.get_prompt(self.get_dataset()[idx]) + generation = self._stop_at_stop_token(generation, self.stop_words) + return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not def process_results(self, generations, references): # TODO: define how the evaluation score is computed from list of \ @@ -93,12 +91,13 @@ def process_results(self, generations, references): return {} -def create_task(subset): +def create_task(name, subset): class ENAMEL(GeneralENAMEL): - __name__ = subset - __qualname__ = subset + __name__ = name + __qualname__ = name + SUBSET = subset def __init__(self, *args, **kwargs): - super().__init__(subset = subset, *args, **kwargs) + super().__init__(subset=self.SUBSET, *args, **kwargs) return ENAMEL def create_all_tasks(): @@ -106,7 +105,7 @@ def create_all_tasks(): :return: {task_name: task} """ return { - "enamel": create_task(subset = "ENAMEL"), - "enamel-algo": create_task(subset = "ENAMEL_Algo"), - "enamel-impl": create_task(subset = "ENAMEL_Impl"), + "enamel": create_task(name="ENAMEL", subset=sorted(set(range(164)) - {2, 23, 41, 45, 53, 60, 71, 92, 97, 99, 102, 123, 124, 135, 137, 138, 144, 148, 156, 157, 159, 160})), + "enamel-algo": create_task(name="ENAMEL_Algo", subset=[10, 18, 36, 39, 40, 43, 46, 49, 55, 59, 63, 76, 83, 96, 107, 109, 114, 129, 147, 154]), + "enamel-impl": create_task(name="ENAMEL_Impl", subset=[1, 5, 8, 9, 11, 12, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 31, 33, 37, 38, 44, 48, 49, 50, 51, 52, 56, 57, 58, 59, 61, 64, 66, 69, 70, 72, 73, 74, 75, 78, 80, 82, 85, 87, 89, 91, 93, 94, 95, 96, 98, 100, 104, 105, 108, 110, 111, 112, 113, 116, 117, 118, 121, 122, 125, 127, 128, 131, 140, 142, 143, 150, 152, 155, 161]), } From 02c43e9a6540d836afce1e7abd18034d414bb349 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:39:12 +0000 Subject: [PATCH 07/28] Add a new benchmark ENAMEL --- .../tasks/custom_metrics/enamel_eval.py | 12 ++++++++ 
bigcode_eval/tasks/enamel.py | 30 +++++++++++-------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 1d44af559..ff7b04464 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -1 +1,13 @@ # TODO: eff@k + +def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): + # TODO + +def might_catch_timeout_signal(): + # TODO + +might_catch_timeout_signal.WARNING = """\ +We have detected that the generated code samples use `try ... except` within a loop, which might catch \ +our timeout signal and cause a dead loop. Since resolving this rare issue via `multiprocessing` would \ +significantly slow down the evaluation process for our large-scale inputs, we have decided not to resolve \ +this issue. If this issue does happen, please consider removing the corresponding code samples.""" diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index f09d0a1ec..489622472 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -7,11 +7,6 @@ Homepage: https://github.com/q-rz/enamel """ -from warnings import warn -import numpy as np -from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.custom_metrics.enamel_eval import - _CITATION = """ @article{qiu2024enamel, title={How efficient is {LLM}-generated code? A rigorous \& high-standard benchmark}, @@ -22,6 +17,12 @@ """ +from warnings import warn +import numpy as np +from bigcode_eval.tasks.humaneval import GeneralHumanEval +from bigcode_eval.custom_metrics.enamel_eval import evaluate_all, might_catch_timeout_signal + + class GeneralENAMEL(GeneralHumanEval): """A task represents an entire benchmark including its dataset, problems, answers, generation settings and evaluation methods. 
@@ -31,10 +32,10 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_NAME = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", - strip_prompt=True, k=[1, 10, 100], num_workers=16, + hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + strip_prompt=True, k=[1, 10, 100], ): - super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) # each problem has a different time limit + super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit if isinstance(subset, list): self.subset = subset else: @@ -42,6 +43,7 @@ def __init__(self, subset, # list of problem IDs self.subset = self.DATASET_SUBSETS[subset] self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness + self.n_reps = n_reps self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec @@ -60,7 +62,7 @@ def get_reference(self, doc): sample from the test dataset :return: str """ - return "" + return "" # TODO: include tests def postprocess_generation(self, generation, idx): """ @@ -73,22 +75,24 @@ def postprocess_generation(self, generation, idx): """ prompt = self.get_prompt(self.get_dataset()[idx]) generation = self._stop_at_stop_token(generation, self.stop_words) + if (not self.warned_dead_loop) and might_catch_timeout_signal(generation): + warn(might_catch_timeout_signal.WARNING) return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not def process_results(self, generations, references): - # TODO: define how the evaluation score is computed from list of \ - # generations and reference solutions """ Takes the list of LM generations and evaluates them against ground truth references, returning the metric for the generations as in {"metric_name": result}. - We encourage to directly load the metric from `evaluate` library to keep the code concise. 
         :param generations: list(list(str))
             list of lists containing generations
         :param references: list(str)
             list of str containing references
         :return: dict[str: float]
         """
-        return {}
+        return evaluate_all(
+            generations, references, k=self.k, hardness=self.hardness, n_reps=self.n_reps,
+            memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec,
+        )
 
 
 def create_task(name, subset):

From d1e10b97615d1e9dee805db159deb0e0d4b44155 Mon Sep 17 00:00:00 2001
From: q-rz <100142775+q-rz@users.noreply.github.com>
Date: Thu, 18 Jul 2024 06:38:20 +0000
Subject: [PATCH 08/28] Add a new benchmark ENAMEL

---
 .../tasks/custom_metrics/enamel_eval.py       | 21 +++++++++++++++++--
 bigcode_eval/tasks/enamel.py                  |  4 +++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py
index ff7b04464..bfc0e19b8 100644
--- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py
+++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py
@@ -1,10 +1,27 @@
 # TODO: eff@k
+
+class Unpickler(pickle.Unpickler):
+    CLS_DICT = {'': Test, '': Refs}
+    def find_class(self, module, name):
+        if module in self.CLS_DICT:
+            return self.CLS_DICT[module]
+        else:
+            return super().find_class(module, name)
+
+
 def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
     # TODO
 
-
-def might_catch_timeout_signal():
-    # TODO
+
+def might_catch_timeout_signal(generation, pattern_seq = (' while ', ' try:')):
+    i = 0
+    for pattern in pattern_seq:
+        i = generation.find(pattern, i)
+        if i == -1:
+            return False
+        i += len(pattern)
+    return True
 
 might_catch_timeout_signal.WARNING = """\
 We have detected that the generated code samples use `try ...
except` within a loop, which might catch \ diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 489622472..09ef082ba 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -17,10 +17,11 @@ """ +import pickle from warnings import warn import numpy as np from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.custom_metrics.enamel_eval import evaluate_all, might_catch_timeout_signal +from bigcode_eval.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): @@ -77,6 +78,7 @@ def postprocess_generation(self, generation, idx): generation = self._stop_at_stop_token(generation, self.stop_words) if (not self.warned_dead_loop) and might_catch_timeout_signal(generation): warn(might_catch_timeout_signal.WARNING) + self.warned_dead_loop = True return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not def process_results(self, generations, references): From 86f3902209e949835903af205dc5f98c4ed3a897 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:30:38 +0000 Subject: [PATCH 09/28] Add a new benchmark ENAMEL --- .../tasks/custom_metrics/enamel_eval.py | 325 +++++++++++++++++- 1 file changed, 321 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index bfc0e19b8..f65813582 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -1,18 +1,176 @@ -# TODO: eff@k +from copy import deepcopy +import gc +import pickle +import time +import os, os.path as osp +import sys +import resource +import platform +import contextlib + +import numpy as np + +def calc_exec_time(ts): # Hodges--Lehmann estimator + ts = np.array(ts) / 2. 
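+    # halving first makes the pairwise sums below equal to the Walsh averages (t_i + t_j) / 2; their median is the Hodges--Lehmann estimate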
+ ts = ts[None, :] + ts[:, None] + ts = ts[np.tril_indices_from(ts)] + return np.median(ts) + +def calc_eff(elapsed, ref, timeout): + return max(0., timeout - elapsed) / (timeout - ref) + +def calc_eff_at_k(e, k): # numerically stable implementation + n = len(e) + lbd = [k / n] + k_ = k - 1 + for r in range(n - 1, k_, -1): + lbd.append(lbd[-1] * (1 - k_ / r)) + lbd = np.flip(lbd) + e = np.sort(e)[k_ :] + return (lbd * e).sum() + +def calc_pass_at_k(n, c, k): # from the HumanEval paper + if n - c < k: return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + +class Test: # a test case + def __init__(self, input = None, answer = None, ref = None): + self.input = input + self.answer = answer + self.ref = ref # reference execution time + +class Refs: # references for efficiency evaluation + def __init__(self, tests, hardness): + neg_inf = float('-inf') + self.refs = [neg_inf] * len(hardness) + self.ref_max = neg_inf + self.lid = None + self.cid = None + # finds the longest reference execution time for calibration + for j, (size, tests_j) in enumerate(tests): + if hardness[j]: + for k, test in enumerate(tests_j): + if self.refs[j] < test.ref: + self.refs[j] = test.ref + if self.ref_max < test.ref: + self.ref_max = test.ref + self.lid = j + self.cid = k class Unpickler(pickle.Unpickler): - CLS_DICT = {'': Test, '': Refs} + CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs} def find_class(self, module, name): if module in self.CLS_DICT: return self.CLS_DICT[module] else: return super().find_class(module, name) +TPL_RUN = '''%s +%s +__t0 = time.time() +__output = %s(*__input) +__t1 = time.time() +''' # % (prompt, solution, entry_point) +TPL_TEST = '''%s + pass +%s +__accepted = __check(__input, __answer, __output) +''' # % (prompt, checker) + +def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): + timeout = timeout_factor * refs.ref_max + memory_bytes = memory_giga * (1024 ** 3) + effs = [] + elapsed_list = [] + for j, (size, tests_j) in enumerate(tests): + n_reps_j = n_reps[j] + level_elapsed = [] + level_break = False + for k, test in enumerate(tests_j): + elapsed = [None for rep in range(n_reps_j)] + for rep in range(n_reps): + scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input + try: + unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) + scope['__input'] = test.input + scope['__answer'] = test.answer # to prevent the code reading the answer + unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input + except TimeoutException as e: + level_break = True + break + except MemoryError as e: + level_break = True + break + except OverflowError as e: + level_break = True + break + except KeyboardInterrupt as e: + raise e + except BaseException as e: + return False, self.zero_effs(), elapsed_list + else: + if '__accepted' in scope and scope['__accepted']: + elapsed[rep] = scope['__t1'] - scope['__t0'] + else: + return False, self.zero_effs(), elapsed_list + if level_break: + break + else: + level_elapsed.append(calc_exec_time(elapsed).item() * time_correction) + elapsed_list.append(level_elapsed) + if level_break: + break + else: + effs.append(calc_eff(elapsed = max(level_elapsed), ref = refs.refs[j], timeout = timeout)) + if j == 0 and level_break: + return False, self.zero_effs(), 
elapsed_list
+    for j in range(len(effs), self.n_levels):
+        effs.append(0.)
+    return True, effs, elapsed_list
 
-def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
-    # TODO
+def get_time_correction(problem, tests, refs, n_reps): # computes the calibration factor of execution time
+    j = refs.lid
+    k = refs.cid
+    test = tests[j][-1][k]
+    n_reps_j = n_reps[j]
+    elapsed = [None for rep in range(n_reps_j)]
+    for rep in range(n_reps_j):
+        scope = dict(time = time, __input = deepcopy(test.input)) # in case that the code modifies the input
+        unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free
+        elapsed[rep] = scope['__t1'] - scope['__t0']
+    elapsed = calc_exec_time(elapsed).item()
+    return refs.ref_max / elapsed
+
+def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
+    if isinstance(k, int):
+        k = [k]
+    min_codes = min(len(codes_i) for codes_i in codes)
+    k = sorted({k_ for k_ in k if k_ <= min_codes})
+    passes = [[] for k_ in k]
+    effs = [[] for k_ in k]
+    gc.collect()
+    for problem, codes_i, tests_i, refs_i in zip(problems, codes, tests, refs):
+        time_correction = get_time_correction(problem = problem, tests = tests_i, refs = refs_i, n_reps = n_reps)
+        n_levels = len(tests_i)
+        problem_passes = []
+        problem_effs = []
+        for code in codes_i:
+            passed, code_effs, code_elapsed = evaluate_one(
+                code = code, problem = problem, tests = tests_i, refs = refs_i,
+                k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga,
+                timeout_factor = timeout_factor, tolerence_sec = tolerence_sec, time_correction = time_correction)
+            problem_passes.append(passed)
+            problem_effs.append(code_effs)
+        for j, k_ in enumerate(k):
+            passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_))
+            effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_))
+    metrics = dict()
+    for k_, pass_k in zip(k, passes):
+        metrics[f'pass@{k_}'] = np.mean(pass_k).item()
+    for k_, eff_k in zip(k, effs):
+        metrics[f'eff@{k_}'] = np.mean(eff_k).item()
+    return metrics
 
 def might_catch_timeout_signal(generation, pattern_seq = (' while ', ' try:')):
     i = 0
     for pattern in pattern_seq:
         i = generation.find(pattern, i)
         if i == -1:
             return False
         i += len(pattern)
     return True
 
 might_catch_timeout_signal.WARNING = """\
 We have detected that the generated code samples use `try ... except` within a loop, which might catch \
 our timeout signal and cause a dead loop. Since resolving this rare issue via `multiprocessing` would \
 significantly slow down the evaluation process for our large-scale inputs, we have decided not to resolve \
 this issue.
If this issue does happen, please consider removing the corresponding code samples.""" + +"""The following functions are adapted from code_eval (@link https://huggingface.co/spaces/evaluate-metric/code_eval)""" + +def get_memory_usage(): + return sys.getsizeof(sys.modules[__name__]) + +@contextlib.contextmanager +def set_memory_limit(maximum_memory_bytes = None): + try: + if maximum_memory_bytes is not None: + _not_darwin = (not platform.uname().system == "Darwin") + _rlimit_as = resource.getrlimit(resource.RLIMIT_AS) + _rlimit_data = resource.getrlimit(resource.RLIMIT_DATA) + if _not_darwin: + _rlimit_stack = resource.getrlimit(resource.RLIMIT_STACK) + memory_limit = int(get_memory_usage() + maximum_memory_bytes) + resource.setrlimit(resource.RLIMIT_AS, (memory_limit, _rlimit_as[-1])) + resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, _rlimit_data[-1])) + if _not_darwin: + resource.setrlimit(resource.RLIMIT_STACK, (memory_limit, _rlimit_stack[-1])) + yield + finally: + if maximum_memory_bytes is not None: + resource.setrlimit(resource.RLIMIT_AS, _rlimit_as) + resource.setrlimit(resource.RLIMIT_DATA, _rlimit_data) + if _not_darwin: + resource.setrlimit(resource.RLIMIT_STACK, _rlimit_stack) + +class TimeoutException(Exception): + pass + +def timeout_signal_handler(signum, frame): + raise TimeoutException("Timed out!") + +@contextlib.contextmanager +def set_time_limit(seconds): + import signal + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, timeout_signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + +import io + +class WriteOnlyStringIO(io.StringIO): + def read(self, *args, **kwargs): + raise OSError + def readline(self, *args, **kwargs): + raise OSError + def readlines(self, *args, **kwargs): + raise OSError + def readable(self, *args, **kwargs): + return False + +class redirect_stdin(contextlib._RedirectStream): # type: ignore + _stream = "stdin" + +@contextlib.contextmanager +def swallow_io(): + stream = WriteOnlyStringIO() + with contextlib.redirect_stdout(stream): + with contextlib.redirect_stderr(stream): + with redirect_stdin(stream): + yield + +@contextlib.contextmanager +def chdir(root): + if root == ".": + yield + return + cwd = os.getcwd() + os.chdir(root) + try: + yield + except BaseException as exc: + raise exc + finally: + os.chdir(cwd) + +@contextlib.contextmanager +def create_tempdir(): + import tempfile + with tempfile.TemporaryDirectory() as dirname: + with chdir(dirname): + yield dirname + +@contextlib.contextmanager +def reliability_guard(): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + with create_tempdir(): + with swallow_io(): + try: + + import faulthandler + + faulthandler.disable() + + import builtins, os, shutil, subprocess + + os.environ["OMP_NUM_THREADS"] = "1" + + _keys = dict( + builtins = ('exit', 'quit'), + os = ('kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir', 'fchdir', 'setuid', 'fork', 'forkpty', 'killpg', 'rename', 'renames', 'truncate', 'replace', 'unlink', 'fchmod', 'fchown', 'chmod', 'chown', 'chroot', 'lchflags', 'lchmod', 'lchown', 'getcwd', 'chdir'), + shutil = ('rmtree', 'move', 'chown'), + subprocess = ('Popen',), + ) + _baks = dict() + for lib, keys in _keys.items(): + obj = locals()[lib] + _bak = dict() + for key in keys: + if hasattr(obj, key): + _bak[key] = getattr(obj, key) + _baks[lib] = _bak + + #__builtins__["help"] = None + + yield + finally: + for lib, keys in _keys.items(): + obj = locals()[lib] + for key, val in _baks[lib].items(): + setattr(obj, key, val) + +def unsafe_execute(program: str, exec_globals: dict): + try: + gc_bak = gc.isenabled() + gc.disable() + with reliability_guard(): + exec(program, exec_globals) + finally: + if gc_bak: + gc.enable() + +def unsafe_timed_execute(program: str, exec_globals: dict, maximum_memory_bytes: float, time_limit_seconds: float): + try: + gc_bak = gc.isenabled() + gc.disable() + with reliability_guard(): + with set_memory_limit(maximum_memory_bytes): + with set_time_limit(time_limit_seconds): + exec(program, exec_globals) + finally: + if gc_bak: + gc.enable() From 4caa5dde9d69d6ff65088a260ee2a4c88cc1844b Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:37:30 +0000 Subject: [PATCH 10/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 2 +- bigcode_eval/tasks/enamel.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index f65813582..694e56d11 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -71,7 +71,7 @@ def find_class(self, module, name): __t0 = time.time() __output = %s(*__input) __t1 = time.time() -''' # % (prompt, solution, entry_point) +''' # % (prompt, code, entry_point) # this should work no matter code includes prompt or not TPL_TEST = '''%s pass %s diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 09ef082ba..e865f5b65 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -74,12 +74,11 @@ def postprocess_generation(self, generation, idx): index of doc in the dataset to which the generation belongs :return: str """ - prompt = self.get_prompt(self.get_dataset()[idx]) generation = self._stop_at_stop_token(generation, self.stop_words) if (not self.warned_dead_loop) and might_catch_timeout_signal(generation): warn(might_catch_timeout_signal.WARNING) self.warned_dead_loop = True - return prompt + "\n pass\n" + generation # this should work no matter generation contains prompt or not + return generation def process_results(self, generations, references): """ From 027afcb58384287509cac5717b856ed77c2c7b19 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:55:08 +0000 Subject: [PATCH 11/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 5 ++--- bigcode_eval/tasks/enamel.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 694e56d11..dbd67cb9d 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -3,7 +3,8 @@ import pickle import time -import os, os.path as osp +import io +import os import sys import resource import platform @@ -230,8 +231,6 @@ def set_time_limit(seconds): finally: signal.setitimer(signal.ITIMER_REAL, 0) -import io - class WriteOnlyStringIO(io.StringIO): def read(self, *args, **kwargs): raise OSError diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index e865f5b65..f45f62106 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -44,7 +44,8 @@ def __init__(self, subset, # list of problem IDs self.subset = self.DATASET_SUBSETS[subset] self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness - self.n_reps = n_reps + self.n_levels = len(self.hardness) + self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec From eb4310325b418352843107469e897f75e9fcd14c Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Thu, 18 Jul 2024 08:15:07 +0000 Subject: [PATCH 12/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index f45f62106..671461790 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -21,7 +21,7 @@ from warnings import warn import numpy as np from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): @@ -30,18 +30,15 @@ class GeneralENAMEL(GeneralHumanEval): """ DATASET_PATH = "q-rz/enamel" - DATASET_NAME = "ENAMEL_HumanEval" + DATASET_NAME = "default" def __init__(self, subset, # list of problem IDs hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit - if isinstance(subset, list): - self.subset = subset - else: - assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}" - self.subset = self.DATASET_SUBSETS[subset] + self.subset = subset + return # @TODO self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO self.hardness = hardness self.n_levels = len(self.hardness) From f3e86ac72247ba322c00ae87be0a14452540f14e Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:26:40 +0000 Subject: [PATCH 13/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 671461790..4362d7574 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -20,6 +20,7 @@ import 
pickle from warnings import warn import numpy as np +from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval from bigcode_eval.tasks.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal @@ -31,6 +32,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "default" + DATASET_FULL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", @@ -38,20 +40,22 @@ def __init__(self, subset, # list of problem IDs ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit self.subset = subset - return # @TODO - self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO + self.dataset_full = self.dataset[self.DATASET_FULL].to_pandas() + self.dataset = self.dataset.iloc[np.array(self.subset), :] self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec - self.tests_path = tests_path + self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") # TODO: load tests from tests_path def get_dataset(self): """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return self.dataset[self.__name__] + return self.dataset + + #TODO get_prompt def get_reference(self, doc): # TODO: get the reference solution from a sample `doc` from the dataset From 5aafc0e472c873ede52d27a5ff21f9e4a6e1f228 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:49:03 +0000 Subject: [PATCH 14/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 4362d7574..6772ac619 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -41,7 +41,7 @@ def __init__(self, subset, # list of problem IDs super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit self.subset = subset self.dataset_full = self.dataset[self.DATASET_FULL].to_pandas() - self.dataset = self.dataset.iloc[np.array(self.subset), :] + self.dataset = self.dataset_full.iloc[np.array(self.subset), :] self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score From afb847184e4bb6215e58cf6f1d89b87ab32e0092 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:12:29 +0000 Subject: [PATCH 15/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 2 +- bigcode_eval/tasks/enamel.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index dbd67cb9d..ea8e550c2 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -59,7 +59,7 
@@ def __init__(self, tests, hardness): self.lid = j self.cid = k -class Unpickler(pickle.Unpickler): +class EnamUnpickler(pickle.Unpickler): CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs} def find_class(self, module, name): if module in self.CLS_DICT: diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 6772ac619..91e826629 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -17,12 +17,12 @@ """ -import pickle from warnings import warn +import pickle import numpy as np from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.tasks.custom_metrics.enamel_eval import Unpickler, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): @@ -48,8 +48,12 @@ def __init__(self, subset, # list of problem IDs self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec + if self.DATASET_PATH != 'q-rz/enamel': + warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") - # TODO: load tests from tests_path + with open(self.tests_path, 'rb') as fi: + self.tests_full = EnamUnpickler(fi).load() + self.tests = [self.tests_full[i] for i in self.subset] def get_dataset(self): """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" From 8cf92a6cdce84dc4c3082c9add0467ca18e5738f Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:15:21 +0000 Subject: [PATCH 16/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index ea8e550c2..9b786c0a8 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -62,8 +62,9 @@ def __init__(self, tests, hardness): class EnamUnpickler(pickle.Unpickler): CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs} def find_class(self, module, name): - if module in self.CLS_DICT: - return self.CLS_DICT[module] + cls_name = f'{module}.{name}' + if cls_name in self.CLS_DICT: + return self.CLS_DICT[cls_name] else: return super().find_class(module, name) From 71c69f6479420ef57809129e2f7412790eb34250 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 22:22:43 +0000 Subject: [PATCH 17/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 49 +++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 91e826629..8e40afc8f 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -22,7 +22,7 @@ import numpy as np from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, Dict, evaluate_all, might_catch_timeout_signal class 
GeneralENAMEL(GeneralHumanEval): @@ -32,44 +32,51 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_PATH = "q-rz/enamel" DATASET_NAME = "default" - DATASET_FULL = "ENAMEL_HumanEval" + DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit - self.subset = subset - self.dataset_full = self.dataset[self.DATASET_FULL].to_pandas() - self.dataset = self.dataset_full.iloc[np.array(self.subset), :] + self.subset = subset if isinstance(subset, list) else list(subset) + self.n_probs = len(self.subset) + self.dataset = self.dataset[self.DATASET_ALL].to_pandas().iloc[self.subset, :] self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec + #warn(f"Problems here have been renumbered 0--{self.n_probs - 1} to compatibilize with `bigcode_eval`") if self.DATASET_PATH != 'q-rz/enamel': warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") with open(self.tests_path, 'rb') as fi: - self.tests_full = EnamUnpickler(fi).load() - self.tests = [self.tests_full[i] for i in self.subset] + tests_all, refs_all = EnamUnpickler(fi).load() + self.tests = [tests_all[i] for i in self.subset] + self.refs = [refs_all[i] for i in self.subset] def get_dataset(self): - """Returns dataset for the task or an iterable of any object, that get_prompt can handle""" - return self.dataset + """Returns dataset as an iterable of namedtuple""" + return self.dataset.itertuples(index=True) - #TODO get_prompt + def get_prompt(self, doc): + """ + :param doc: namedtuple + a row from the dataset + :return: str + """ + return super().get_prompt(doc._asdict()) def get_reference(self, doc): - # TODO: get the reference solution from a sample `doc` from the dataset """ - Builds the reference solution for the doc (sample from the test dataset). 
- :param doc: dict{str: str} - sample from the test dataset - :return: str + :param doc: namedtuple + a row from the dataset + :return: tuple (problem, tests, refs) """ - return "" # TODO: include tests + i = doc.Index + return (doc, self.tests[i], self.refs[i]) def postprocess_generation(self, generation, idx): """ @@ -96,8 +103,16 @@ def process_results(self, generations, references): list of str containing refrences :return: dict[str: float] """ + problems = [] + tests = [] + refs = [] + for problem, tests_i, refs_i in references: + problems.append(problem) + tests.append(tests_i) + refs.append(refs_i) return evaluate_all( - generations, references, k=self.k, hardness=self.hardness, n_reps=self.n_reps, + problems=problems, codes=generations, tests=tests, refs=refs, + k=self.k, hardness=self.hardness, n_reps=self.n_reps, memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec, ) From 32265c8e230dbc9a63b702154070370e3e92fabf Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 22:32:13 +0000 Subject: [PATCH 18/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 8e40afc8f..762faff75 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -22,7 +22,7 @@ import numpy as np from huggingface_hub import hf_hub_download from bigcode_eval.tasks.humaneval import GeneralHumanEval -from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, Dict, evaluate_all, might_catch_timeout_signal +from bigcode_eval.tasks.custom_metrics.enamel_eval import EnamUnpickler, evaluate_all, might_catch_timeout_signal class GeneralENAMEL(GeneralHumanEval): From bef756648193448eb4eb25468e2f1a0462981de4 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 23:49:27 +0000 Subject: [PATCH 19/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 762faff75..34207e70c 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -59,7 +59,7 @@ def __init__(self, subset, # list of problem IDs def get_dataset(self): """Returns dataset as an iterable of namedtuple""" - return self.dataset.itertuples(index=True) + return list(self.dataset.itertuples(index=True)) def get_prompt(self, doc): """ From bf3348f0654a776482e7c1ae66196c6fdf8b5fd9 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Sun, 21 Jul 2024 23:53:19 +0000 Subject: [PATCH 20/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 9b786c0a8..9bbf57293 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -91,7 +91,7 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t level_break = False for k, test in enumerate(tests_j): elapsed = [None for rep in range(n_reps_j)] - for rep in range(n_reps): + for rep in range(n_reps_j): scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input try: 
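                     # run the candidate on this test case under the memory cap and the per-problem time limit, then check its output with the problem's checker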
unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) From 93c47cc4990d8edb861f09f8d520513d644fd573 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 00:25:33 +0000 Subject: [PATCH 21/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 16 ++++++++-------- bigcode_eval/tasks/enamel.py | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 9bbf57293..50f9943bd 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -83,8 +83,9 @@ def find_class(self, module, name): def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): timeout = timeout_factor * refs.ref_max memory_bytes = memory_giga * (1024 ** 3) + n_levels = len(tests) + zero_effs = [0. for j in range(n_levels)] effs = [] - elapsed_list = [] for j, (size, tests_j) in enumerate(tests): n_reps_j = n_reps[j] level_elapsed = [] @@ -110,26 +111,25 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t except KeyboardInterrupt as e: raise e except BaseException as e: - return False, self.zero_effs(), elapsed_list + return False, zero_effs else: if '__accepted' in scope and scope['__accepted']: elapsed[rep] = scope['__t1'] - scope['__t0'] else: - return False, self.zero_effs(), elapsed_list + return False, zero_effs if level_break: break else: level_elapsed.append(calc_exec_time(elapsed).item() * time_correction) - elapsed_list.append(level_elapsed) if level_break: break else: effs.append(calc_eff(elapsed = max(level_elapsed), ref = refs.refs[j], timeout = timeout)) if j == 0 and level_break: - return False, self.zero_effs(), elapsed_list - for j in range(len(effs), self.n_levels): + return False, zero_effs + for j in range(len(effs), n_levels): effs.append(0.) 
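     # unreached (harder) levels have been padded with zero efficiency; the sample still counts as passed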
- return True, effs, elapsed_list + return True, effs def get_time_correction(problem, tests, refs, n_reps): # computes the calibration factor of of execution time j = refs.lid @@ -158,7 +158,7 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, problem_passes = [] problem_effs = [] for code in codes_i: - passed, code_effs, code_elapsed = evaluate_one( + passed, code_effs = evaluate_one( code = code, problem = problem, tests = tests_i, refs = refs_i, k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga, timeout_factor = timeout_factor, tolerence_sec = tolerence_sec, time_correction = time_correction) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 34207e70c..6d2c4283f 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -42,6 +42,7 @@ def __init__(self, subset, # list of problem IDs self.subset = subset if isinstance(subset, list) else list(subset) self.n_probs = len(self.subset) self.dataset = self.dataset[self.DATASET_ALL].to_pandas().iloc[self.subset, :] + self.prob_ids = {row.task_id: i for i, row in enumerate(self.dataset.itertuples(index=False))} self.hardness = hardness self.n_levels = len(self.hardness) self.n_reps = [n_reps if self.hardness[j] else 1 for j in range(self.n_levels)] # no need to repeat if it does not count into the efficiency score @@ -59,7 +60,7 @@ def __init__(self, subset, # list of problem IDs def get_dataset(self): """Returns dataset as an iterable of namedtuple""" - return list(self.dataset.itertuples(index=True)) + return list(self.dataset.itertuples(index=False)) def get_prompt(self, doc): """ @@ -75,7 +76,7 @@ def get_reference(self, doc): a row from the dataset :return: tuple (problem, tests, refs) """ - i = doc.Index + i = self.prob_ids[doc.task_id] return (doc, self.tests[i], self.refs[i]) def postprocess_generation(self, generation, idx): From 6b6163da133b2103263f672491aa5935252a352e Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 00:36:02 +0000 Subject: [PATCH 22/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 50f9943bd..f5a9c0764 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -167,6 +167,7 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, for j, k_ in enumerate(k): passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_)) effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_)) + if effs[j][-1] < 0.98: print(f'{problem.task_id}: eff={effs[j][-1]:.4f}', flush = True) metrics = dict() for k_, pass_k in zip(k, passes): metrics[f'pass@{k_}'] = np.mean(pass_k).item() From fd7694fbc6430ede2a56f78e9a4d6466c4865dfe Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 03:47:11 +0000 Subject: [PATCH 23/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index f5a9c0764..57f7b943e 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ 
b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -81,7 +81,7 @@ def find_class(self, module, name): ''' # % (prompt, checker) def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): - timeout = timeout_factor * refs.ref_max + timeout = timeout_factor * refs.ref_max / time_correction memory_bytes = memory_giga * (1024 ** 3) n_levels = len(tests) zero_effs = [0. for j in range(n_levels)] @@ -100,9 +100,11 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t scope['__answer'] = test.answer # to prevent the code reading the answer unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input except TimeoutException as e: + print(f'TLE: {problem.task_id} level={j} case={k}')########## level_break = True break except MemoryError as e: + print(f'MLE: {problem.task_id} level={j} case={k}')########## level_break = True break except OverflowError as e: @@ -167,7 +169,7 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, for j, k_ in enumerate(k): passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_)) effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_)) - if effs[j][-1] < 0.98: print(f'{problem.task_id}: eff={effs[j][-1]:.4f}', flush = True) + if abs(effs[j][-1] - 1.) > 0.03: print(f'{problem.task_id}: eff={effs[j][-1]:.4f} c={time_correction:.4f}', flush = True)############## metrics = dict() for k_, pass_k in zip(k, passes): metrics[f'pass@{k_}'] = np.mean(pass_k).item() From cd0810c06d9c8aa21051d4294624b58f872849ac Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 04:57:22 +0000 Subject: [PATCH 24/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/custom_metrics/enamel_eval.py | 11 +++++++---- bigcode_eval/tasks/enamel.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index 57f7b943e..d51b01921 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -81,7 +81,8 @@ def find_class(self, module, name): ''' # % (prompt, checker) def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): - timeout = timeout_factor * refs.ref_max / time_correction + timeout = timeout_factor * refs.ref_max + time_limit = timeout / min(time_correction, 1.) memory_bytes = memory_giga * (1024 ** 3) n_levels = len(tests) zero_effs = [0. 
for j in range(n_levels)] @@ -95,16 +96,16 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t for rep in range(n_reps_j): scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input try: - unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) + unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, time_limit + tolerence_sec) scope['__input'] = test.input scope['__answer'] = test.answer # to prevent the code reading the answer unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input except TimeoutException as e: - print(f'TLE: {problem.task_id} level={j} case={k}')########## + print(f'TLE {problem.task_id} level={j} case={k}')########## level_break = True break except MemoryError as e: - print(f'MLE: {problem.task_id} level={j} case={k}')########## + print(f'MLE {problem.task_id} level={j} case={k}')########## level_break = True break except OverflowError as e: @@ -113,11 +114,13 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t except KeyboardInterrupt as e: raise e except BaseException as e: + print(f'RE {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs else: if '__accepted' in scope and scope['__accepted']: elapsed[rep] = scope['__t1'] - scope['__t0'] else: + print(f'WA {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs if level_break: break diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 6d2c4283f..820dd5cb9 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -49,7 +49,6 @@ def __init__(self, subset, # list of problem IDs self.memory_giga = memory_giga self.timeout_factor = timeout_factor self.tolerence_sec = tolerence_sec - #warn(f"Problems here have been renumbered 0--{self.n_probs - 1} to compatibilize with `bigcode_eval`") if self.DATASET_PATH != 'q-rz/enamel': warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. 
Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") @@ -85,7 +84,7 @@ def postprocess_generation(self, generation, idx): :param generation: str code generation from LM :param idx: int (if needed) - index of doc in the dataset to which the generation belongs + index of doc in the dataset to which the generation belongs; not needed here :return: str """ generation = self._stop_at_stop_token(generation, self.stop_words) From de62d2048139d5a19eec5e5d0951f0d929cefae9 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 05:04:19 +0000 Subject: [PATCH 25/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 820dd5cb9..5250fa272 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -35,7 +35,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=8., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit From 80f4e147dcd2ad13876ef5870b5c59abdabcf0b7 Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 05:17:46 +0000 Subject: [PATCH 26/28] Add a new benchmark ENAMEL --- .../tasks/custom_metrics/enamel_eval.py | 47 +++++++++---------- bigcode_eval/tasks/enamel.py | 15 +++--- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/bigcode_eval/tasks/custom_metrics/enamel_eval.py b/bigcode_eval/tasks/custom_metrics/enamel_eval.py index d51b01921..7cf1b82e8 100644 --- a/bigcode_eval/tasks/custom_metrics/enamel_eval.py +++ b/bigcode_eval/tasks/custom_metrics/enamel_eval.py @@ -80,9 +80,8 @@ def find_class(self, module, name): __accepted = __check(__input, __answer, __output) ''' # % (prompt, checker) -def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec, time_correction): +def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): timeout = timeout_factor * refs.ref_max - time_limit = timeout / min(time_correction, 1.) memory_bytes = memory_giga * (1024 ** 3) n_levels = len(tests) zero_effs = [0. 
for j in range(n_levels)] @@ -96,16 +95,14 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t for rep in range(n_reps_j): scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case that the code modifies the input try: - unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, time_limit + tolerence_sec) + unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec) scope['__input'] = test.input scope['__answer'] = test.answer # to prevent the code reading the answer unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input except TimeoutException as e: - print(f'TLE {problem.task_id} level={j} case={k}')########## level_break = True break except MemoryError as e: - print(f'MLE {problem.task_id} level={j} case={k}')########## level_break = True break except OverflowError as e: @@ -114,18 +111,16 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t except KeyboardInterrupt as e: raise e except BaseException as e: - print(f'RE {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs else: if '__accepted' in scope and scope['__accepted']: elapsed[rep] = scope['__t1'] - scope['__t0'] else: - print(f'WA {problem.task_id} level={j} case={k} {type(e)} {e}')########## return False, zero_effs if level_break: break else: - level_elapsed.append(calc_exec_time(elapsed).item() * time_correction) + level_elapsed.append(calc_exec_time(elapsed).item()) if level_break: break else: @@ -136,20 +131,21 @@ def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, t effs.append(0.) 
return True, effs -def get_time_correction(problem, tests, refs, n_reps): # computes the calibration factor of of execution time - j = refs.lid - k = refs.cid - test = tests[j][-1][k] - n_reps_j = n_reps[j] - elapsed = [None for rep in range(n_reps_j)] - for rep in range(n_reps_j): - scope = dict(time = time, __input = deepcopy(test.input)) # in case that the code modifies the input - unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free - elapsed[rep] = scope['__t1'] - scope['__t0'] - elapsed = calc_exec_time(elapsed).item() - return refs.ref_max / elapsed - -def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): +def compute_refs(problem, tests, n_reps, hardness): # computes the calibration factor of of execution time + for j in range(len(tests)): + if hardness[j]: + for k in range(len(tests[j][-1])): + test = tests[j][-1][k] + n_reps_j = n_reps[j] + elapsed = [None for rep in range(n_reps_j)] + for rep in range(n_reps_j): + scope = dict(time = time, __input = deepcopy(test.input)) # in case that the code modifies the input + unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free + elapsed[rep] = scope['__t1'] - scope['__t0'] + test.ref = calc_exec_time(elapsed).item() + return Refs(tests = tests, hardness = hardness) + +def evaluate_all(problems, codes, tests, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec): if isinstance(k, int): k = [k] min_codes = min(len(codes_i) for codes_i in codes) @@ -157,8 +153,8 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, passes = [[] for k_ in k] effs = [[] for k_ in k] gc.collect() - for problem, codes_i, tests_i, refs_i in zip(problems, codes, tests, refs): - time_correction = get_time_correction(problem = problem, tests = tests_i, refs = refs_i, n_reps = n_reps) + for problem, codes_i, tests_i in zip(problems, codes, tests): + refs_i = compute_refs(problem = problem, tests = tests_i, n_reps = n_reps, hardness = hardness) n_levels = len(tests_i) problem_passes = [] problem_effs = [] @@ -166,13 +162,12 @@ def evaluate_all(problems, codes, tests, refs, k, hardness, n_reps, memory_giga, passed, code_effs = evaluate_one( code = code, problem = problem, tests = tests_i, refs = refs_i, k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga, - timeout_factor = timeout_factor, tolerence_sec = tolerence_sec, time_correction = time_correction) + timeout_factor = timeout_factor, tolerence_sec = tolerence_sec) problem_passes.append(passed) problem_effs.append(code_effs) for j, k_ in enumerate(k): passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_)) effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_)) - if abs(effs[j][-1] - 1.) 
> 0.03: print(f'{problem.task_id}: eff={effs[j][-1]:.4f} c={time_correction:.4f}', flush = True)############## metrics = dict() for k_, pass_k in zip(k, passes): metrics[f'pass@{k_}'] = np.mean(pass_k).item() diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 5250fa272..0ebcce402 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -35,7 +35,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=8., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + hardness=[0., 3., 3., 4.], n_reps = 10, memory_giga=10., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit @@ -53,9 +53,8 @@ def __init__(self, subset, # list of problem IDs warn(f"Tests are loaded from {self.DATASET_PATH}/{tests_path} by `pickle`. Unpickling files from an unknown provider can be unsafe.") self.tests_path = hf_hub_download(repo_id = self.DATASET_PATH, filename = tests_path, repo_type = "dataset") with open(self.tests_path, 'rb') as fi: - tests_all, refs_all = EnamUnpickler(fi).load() + tests_all, _ = EnamUnpickler(fi).load() self.tests = [tests_all[i] for i in self.subset] - self.refs = [refs_all[i] for i in self.subset] def get_dataset(self): """Returns dataset as an iterable of namedtuple""" @@ -73,10 +72,10 @@ def get_reference(self, doc): """ :param doc: namedtuple a row from the dataset - :return: tuple (problem, tests, refs) + :return: tuple (problem, tests) """ i = self.prob_ids[doc.task_id] - return (doc, self.tests[i], self.refs[i]) + return doc, self.tests[i] def postprocess_generation(self, generation, idx): """ @@ -105,13 +104,11 @@ def process_results(self, generations, references): """ problems = [] tests = [] - refs = [] - for problem, tests_i, refs_i in references: + for problem, tests_i in references: problems.append(problem) tests.append(tests_i) - refs.append(refs_i) return evaluate_all( - problems=problems, codes=generations, tests=tests, refs=refs, + problems=problems, codes=generations, tests=tests, k=self.k, hardness=self.hardness, n_reps=self.n_reps, memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec, ) From 71094336479a59b1d127b46c553bf4ed5ff9eaed Mon Sep 17 00:00:00 2001 From: q-rz <100142775+q-rz@users.noreply.github.com> Date: Mon, 22 Jul 2024 05:22:22 +0000 Subject: [PATCH 27/28] Add a new benchmark ENAMEL --- bigcode_eval/tasks/enamel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcode_eval/tasks/enamel.py b/bigcode_eval/tasks/enamel.py index 0ebcce402..35c7be212 100644 --- a/bigcode_eval/tasks/enamel.py +++ b/bigcode_eval/tasks/enamel.py @@ -35,7 +35,7 @@ class GeneralENAMEL(GeneralHumanEval): DATASET_ALL = "ENAMEL_HumanEval" def __init__(self, subset, # list of problem IDs - hardness=[0., 3., 3., 4.], n_reps = 10, memory_giga=10., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", + hardness=[0., 3., 3., 4.], n_reps = 6, memory_giga=10., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl", strip_prompt=True, k=[1, 10, 100], ): super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit From cca02b2431bee950cd5248272394a58d3d264325 Mon Sep 17 
00:00:00 2001
From: Ruizhong Qiu <100142775+q-rz@users.noreply.github.com>
Date: Sun, 21 Jul 2024 23:13:18 -0700
Subject: [PATCH 28/28] Update README.md

---
 docs/README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/README.md b/docs/README.md
index 903c6a122..e652061bf 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -426,6 +426,27 @@ accelerate launch main.py \
   --metric_output_path .json
 ```
 
+## ENAMEL
+
+[ENAMEL](https://github.com/q-rz/enamel) is a rigorous & high-standard benchmark for evaluating the efficiency of generated Python code under large-scale inputs. It supports a new efficiency metric called eff@k, which generalizes the pass@k metric. In addition, it provides expert-written reference solutions and expert-written test case generators, thus setting a high standard for efficiency evaluation. See [this paper](https://arxiv.org/abs/2406.06647) for details.
+
+**Notice:** It is NOT recommended to use multiple threads or processes during efficiency evaluation, because resource contention can distort the measured execution times and thus the efficiency results.
+
+```python
+accelerate launch main.py \
+  --model <MODEL_NAME> \
+  --max_length_generation 2048 \
+  --tasks enamel \
+  --temperature 0.8 \
+  --top_p 0.95 \
+  --do_sample True \
+  --n_samples 10 \
+  --batch_size 10 \
+  --allow_code_execution
+```
+
+This implementation also supports the two subsets Algo and Impl from the paper: `--tasks enamel-algo` / `--tasks enamel-impl`.
+
 ## Code generation benchmarks without unit tests
 For these tasks, we do single generations and compare the generated code against reference solutions and compute BLEU score. For the following tasks, we use a two-shot setting where we include 2 inputs and their solutions in the prompt, all preceded by an instruction such as: ` "Answer the following instructions in a one line SQL query:\n"`. The solutions consist of one line so we stop the generation when a new line is generated. 3 languages are present: Python, SQL and Java.
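
For readers of the docs section added above, here is a minimal sketch of how the eff@k metric can be estimated. It is illustrative only and is not the `calc_eff_at_k` implementation shipped in `bigcode_eval/tasks/custom_metrics/enamel_eval.py`: following the ENAMEL paper, each generated sample receives an efficiency score (hardness-weighted across input levels, and 0 if it fails or times out), and eff@k is estimated as the expected maximum score over a uniformly random subset of k of the n samples, computed from the sorted scores; pass@k is recovered when the scores are 0/1. The helper name `eff_at_k` and the example scores below are hypothetical.

```python
from math import comb

import numpy as np


def eff_at_k(effs, k):
    """Unbiased estimator of eff@k from n per-sample efficiency scores.

    eff@k = E[ max_{i in S} e_i ] over uniformly random subsets S of size k.
    With scores sorted ascending as e_(1) <= ... <= e_(n), the i-th smallest
    score is the subset maximum with probability C(i-1, k-1) / C(n, k).
    """
    e = np.sort(np.asarray(effs, dtype=float))  # ascending order
    n = len(e)
    if not 1 <= k <= n:
        raise ValueError("k must satisfy 1 <= k <= n")
    # C(i-1, k-1) is 0 for i < k, so only the top n-k+1 scores contribute
    weights = [comb(i - 1, k - 1) / comb(n, k) for i in range(1, n + 1)]
    return float(np.dot(weights, e))


# Example with n_samples = 10 for one problem: three samples fail (score 0),
# the rest run with varying efficiency relative to the expert reference.
scores = [0.0, 0.0, 0.0, 0.42, 0.55, 0.61, 0.70, 0.74, 0.81, 0.93]
print(eff_at_k(scores, k=1))   # mean efficiency of a single sample (0.476)
print(eff_at_k(scores, k=10))  # best efficiency among all samples (0.93)
```

As with pass@k, eff@k for a given k is only meaningful when the number of generations per problem is at least k; the example command above uses `--n_samples 10`, which supports reporting eff@1 and eff@10.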