Commit

test_examples: use baseline fixture
Signed-off-by: U. Artie Eoff <[email protected]>
uartie committed Feb 21, 2025
1 parent c680cc9 commit f087196
Showing 37 changed files with 514 additions and 66 deletions.
440 changes: 440 additions & 0 deletions tests/baselines/fixture/tests/test_examples.json

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 2,
"eval_batch_size": 4,
@@ -59,4 +59,4 @@
}
}
}
}
}
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 2,
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"wikitext": {
"num_train_epochs": 3,
"eval_batch_size": 4,
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"ydshieh/coco_dataset_script": {
"num_train_epochs": 1,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"wikitext": {
"num_train_epochs": 2,
"eval_batch_size": 4,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"wikitext": {
"num_train_epochs": 2,
"eval_batch_size": 4,
File renamed without changes.
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"databricks/databricks-dolly-15k": {
"num_train_epochs": 1,
"eval_batch_size": 2,
File renamed without changes.
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"squad": {
"num_train_epochs": 1,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"cifar10": {
"num_train_epochs": 1,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"cnn_dailymail": {
"num_train_epochs": 1,
"eval_batch_size": 4,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"cifar10": {
"num_train_epochs": 1,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"common_language": {
"num_train_epochs": 10,
"eval_batch_size": 64,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"regisss/librispeech_asr_for_optimum_habana_ci": {
"num_train_epochs": 2,
"eval_batch_size": 8,
@@ -1,5 +1,5 @@
{
-"gaudi": {
+"gaudi1": {
"mozilla-foundation/common_voice_11_0": {
"num_train_epochs": 10,
"eval_batch_size": 2,
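The config edits above all make the same change: the top-level "gaudi" key is renamed to "gaudi1" so that each example config JSON can be indexed directly by the device context string. Below is a minimal sketch of how the updated test code selects a per-device, per-task section, assuming the tests/configs/examples layout introduced by this commit; the load_test_config helper and the environment-variable fallback for OH_DEVICE_CONTEXT are illustrative stand-ins, not part of the commit itself.

```python
import json
import os
from pathlib import Path

# Assumed layout, mirroring the diff: per-example config JSONs key their
# settings by device context ("gaudi1", "gaudi2", ...) instead of "gaudi".
CONFIG_DIRECTORY = Path("tests") / "configs" / "examples"

# In the real suite OH_DEVICE_CONTEXT comes from the test environment;
# reading an environment variable here is only an illustrative stand-in.
OH_DEVICE_CONTEXT = os.environ.get("OH_DEVICE_CONTEXT", "gaudi1")


def load_test_config(model_name: str, task_name: str) -> dict:
    """Pick the per-device, per-task section of an example config JSON."""
    config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
    path_to_config = CONFIG_DIRECTORY / f"{config_name}.json"
    with path_to_config.open("r") as json_file:
        # Select the section matching the current device, e.g. "gaudi1"
        test_config = json.load(json_file)[OH_DEVICE_CONTEXT]
    # Then select the task section, e.g. "squad" or "wikitext"
    return test_config[task_name]


# Hypothetical usage: hyperparameters for a bert_base_uncased.json on first-gen Gaudi
# cfg = load_test_config("bert-base-uncased", "squad")
# cfg["num_train_epochs"], cfg["eval_batch_size"],
# cfg["distribution"]["single_card"]["learning_rate"]
```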
104 changes: 56 additions & 48 deletions tests/test_examples.py
@@ -14,7 +14,10 @@
# limitations under the License.

import json
+import logging
+import operator
import os
+import pytest
import re
import subprocess
from distutils.util import strtobool
@@ -54,15 +57,15 @@
)


-BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("baselines")
+CONFIG_DIRECTORY = Path(__file__).parent.resolve() / Path("configs") / Path("examples")
# Models should reach at least 99% of their baseline accuracy
ACCURACY_PERF_FACTOR = 0.99
# Trainings/Evaluations should last at most 5% longer than the baseline
TIME_PERF_FACTOR = 1.05


IS_GAUDI2 = bool("gaudi2" == OH_DEVICE_CONTEXT)

+IS_GAUDI1 = bool("gaudi1" == OH_DEVICE_CONTEXT)

def _get_supported_models_for_script(
models_to_test: Dict[str, List[Tuple[str]]],
@@ -454,29 +457,28 @@ def test(self):

self._install_requirements(example_script.parent / "requirements.txt")

-# collect baseline from <model_name>_eager.json if eager_mode is True
+# collect test_config from <model_name>_eager.json if eager_mode is True
if self.EAGER_MODE:
-baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
+config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
else:
-baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
+config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")

-path_to_baseline = BASELINE_DIRECTORY / Path(baseline_name).with_suffix(".json")
+path_to_config = CONFIG_DIRECTORY / Path(config_name).with_suffix(".json")

-with path_to_baseline.open("r") as json_file:
-device = "gaudi2" if IS_GAUDI2 else "gaudi"
-baseline = json.load(json_file)[device]
+with path_to_config.open("r") as json_file:
+test_config = json.load(json_file)[OH_DEVICE_CONTEXT]
if isinstance(self.TASK_NAME, list):
for key in self.TASK_NAME:
-if key in baseline:
-baseline = baseline[key]
+if key in test_config:
+test_config = test_config[key]
break
-if "num_train_epochs" not in baseline:
+if "num_train_epochs" not in test_config:
raise ValueError(
-f"Couldn't find a baseline associated to any of these tasks: {self.TASK_NAME}."
+f"Couldn't find a test config associated to any of these tasks: {self.TASK_NAME}."
)
self.TASK_NAME = key
else:
-baseline = baseline[self.TASK_NAME]
+test_config = test_config[self.TASK_NAME]

distribution = "single_card"
if multi_card:
@@ -507,7 +509,7 @@ def test(self):
if fp8 and "llama" in model_name:
env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt")

-extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
+extra_command_line_arguments = test_config.get("distribution").get(distribution).get("extra_arguments", [])

if self.EAGER_MODE:
env_variables["PT_HPU_LAZY_MODE"] = "0"
@@ -569,10 +571,10 @@ def test(self):
gaudi_config_name,
tmp_dir,
task=self.TASK_NAME,
-lr=baseline.get("distribution").get(distribution).get("learning_rate"),
-train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"),
-eval_batch_size=baseline.get("eval_batch_size"),
-num_epochs=baseline.get("num_train_epochs"),
+lr=test_config.get("distribution").get(distribution).get("learning_rate"),
+train_batch_size=test_config.get("distribution").get(distribution).get("train_batch_size"),
+eval_batch_size=test_config.get("eval_batch_size"),
+num_epochs=test_config.get("num_train_epochs"),
extra_command_line_arguments=extra_command_line_arguments,
)
print(f"\n\nCommand to test: {' '.join(cmd_line[:])}\n")
@@ -585,7 +587,7 @@ def test(self):
with open(Path(tmp_dir) / "all_results.json") as fp:
results = json.load(fp)
# Ensure performance requirements (accuracy, training time) are met
-self.assert_no_regression(results, baseline.get("distribution").get(distribution), model_name)
+self.assert_no_regression(results, test_config.get("distribution").get(distribution).get("metrics"), model_name)

# TODO: is a cleanup of the dataset cache needed?
# self._cleanup_dataset_cache()
@@ -612,17 +614,24 @@ class ExampleTesterBase(TestCase):
DATASET_PARAMETER_NAME = "dataset_name"
DATASET_NAME = None
REGRESSION_METRICS = {
-"eval_f1": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-"eval_accuracy": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-"perplexity": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-"eval_rougeLsum": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-"train_runtime": (TestCase.assertLessEqual, TIME_PERF_FACTOR),
-"eval_wer": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-"train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
-"eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
+"eval_f1": (operator.ge, ACCURACY_PERF_FACTOR),
+"eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),
+"perplexity": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+"eval_rougeLsum": (operator.ge, ACCURACY_PERF_FACTOR),
+"train_runtime": (operator.le, TIME_PERF_FACTOR),
+"eval_wer": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+"train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
+"eval_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
}
EAGER_MODE = False

+@pytest.fixture(autouse=True)
+def _use_(self, baseline):
+"""
+https://docs.pytest.org/en/stable/how-to/unittest.html#using-autouse-fixtures-and-accessing-other-fixtures
+"""
+self.baseline = baseline

def _create_command_line(
self,
multi_card: bool,
@@ -717,20 +726,18 @@ def _install_requirements(self, requirements_filename: Union[str, os.PathLike]):
return_code = p.wait()
self.assertEqual(return_code, 0)

-def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
+def assert_no_regression(self, results: Dict, metrics: list, model_name: str):
"""
Assert whether all possible performance requirements are met.
Attributes:
results (Dict): results of the run to assess
baseline (Dict): baseline to assert whether or not there is regression
"""
# Gather all the metrics to assess
-metrics_to_assess = []
-for metric_name in self.REGRESSION_METRICS.keys():
-if metric_name in baseline and metric_name in results:
-metrics_to_assess.append(metric_name)
-# There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
+metrics_to_assess = list(set(self.REGRESSION_METRICS.keys()) & set(metrics) & set(results.keys()))
min_number_metrics = 3

+# There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
if (
self.EXAMPLE_NAME in ["run_clip", "run_bridgetower", "sft", "dpo", "ppo", "reward_modeling"]
or "bloom" in model_name
@@ -745,25 +752,26 @@ def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
(
f"{len(metrics_to_assess)} asserted metric(s) while at least 3 are expected (throughput + training"
f" time + accuracy). Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received:"
-f" {baseline.keys()}"
+f" {metrics}"
),
)

-# Message to display if one test fails
-# This enables to show all the results and baselines even if one test fails before others
-failure_message = "\n===== Assessed metrics (measured vs thresholded baseline) =====\n"
-for metric_name in metrics_to_assess:
-failure_message += f"{metric_name}: {results[metric_name]} vs {self.REGRESSION_METRICS[metric_name][1] * baseline[metric_name]}\n"

# Assess metrics
+passed = True
for metric_name in metrics_to_assess:
-assert_function, threshold_factor = self.REGRESSION_METRICS[metric_name]
-assert_function(
-self,
-results[metric_name],
-threshold_factor * baseline[metric_name],
-msg=f"for metric {metric_name}. {failure_message}",
-)
+fn, threshold = self.REGRESSION_METRICS[metric_name]
+def check(actual, ref):
+check.msg = f"{metric_name}: {fn.__name__}({actual}, {threshold} * {ref})\n"
+return fn(actual, threshold * ref)
+check.msg = ""

+try:
+self.baseline.assertRef(compare=check, context=[OH_DEVICE_CONTEXT], **{metric_name:results[metric_name]})
+except Exception as e:
+logging.getLogger().error(check.msg)
+passed = False

+assert passed, f"One or more metrics failed"


class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue"):
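The test_examples.py changes above swap the unittest assertion helpers for plain operator functions and route reference values through the new baseline fixture (self.baseline.assertRef). Below is a minimal sketch of the threshold logic those (operator, factor) pairs encode; check_metric is an illustrative stand-in for the check() closure in the diff, not the fixture API itself, and the numbers in the usage example are made up.

```python
import operator

ACCURACY_PERF_FACTOR = 0.99  # reach at least 99% of the baseline accuracy
TIME_PERF_FACTOR = 1.05      # take at most 105% of the baseline runtime

# Same shape as the new REGRESSION_METRICS: metric -> (comparison, threshold factor)
REGRESSION_METRICS = {
    "eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),
    "train_runtime": (operator.le, TIME_PERF_FACTOR),
    "train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
}


def check_metric(metric_name: str, measured: float, reference: float) -> bool:
    """Illustrative stand-in for the per-metric check() closure passed to assertRef."""
    fn, threshold = REGRESSION_METRICS[metric_name]
    ok = fn(measured, threshold * reference)
    if not ok:
        # Mirrors the failure message format built in check(),
        # e.g. "train_runtime: le(410.2, 1.05 * 380.0)"
        print(f"{metric_name}: {fn.__name__}({measured}, {threshold} * {reference})")
    return ok


# Made-up numbers: accuracy may drop at most 1%, runtime may grow at most 5%
assert check_metric("eval_accuracy", 0.99, 1.0)     # 0.99 >= 0.99 * 1.0  -> passes
assert check_metric("train_runtime", 400.0, 390.0)  # 400.0 <= 1.05 * 390.0 -> passes
```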
