diff --git a/tests/baselines/fixture/tests/test_examples.json b/tests/baselines/fixture/tests/test_examples.json
new file mode 100644
index 0000000000..d1820727f8
--- /dev/null
+++ b/tests/baselines/fixture/tests/test_examples.json
@@ -0,0 +1,440 @@
+{
+  "tests/test_examples.py::CausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": {
+    "gaudi2": {
+      "perplexity": 26.39,
+      "train_runtime": 356.07,
+      "train_samples_per_second": 14.06
+    }
+  },
+  "tests/test_examples.py::CausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_single_card": {
+    "gaudi1": {
+      "perplexity": 3.9168,
+      "train_runtime": 132.665,
+      "train_samples_per_second": 2.295
+    },
+    "gaudi2": {
+      "perplexity": 3.8436,
+      "train_runtime": 113.9713,
+      "train_samples_per_second": 18.428
+    }
+  },
+  "tests/test_examples.py::DeepSpeedTextClassificationExampleTester::test_run_glue_LlamaGuard-7b_deepspeed": {
+    "gaudi2": {
+      "eval_f1": 0.8873483535528596,
+      "train_runtime": 62.4539,
+      "train_samples_per_second": 342.169
+    }
+  },
+  "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_CodeLlama-13b-Instruct-hf_deepspeed": {
+    "gaudi2": {
+      "perplexity": 6.877496628184696,
+      "train_runtime": 542.2985,
+      "train_samples_per_second": 18.789
+    }
+  },
+  "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_bloom-7b1_deepspeed": {
+    "gaudi1": {
+      "train_runtime": 1556.481,
+      "train_samples_per_second": 4.757
+    }
+  },
+  "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_chatglm3-6b_deepspeed": {
+    "gaudi2": {
+      "perplexity": 16.51629,
+      "train_runtime": 445,
+      "train_samples_per_second": 18.216
+    }
+  },
+  "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_deepspeed": {
+    "gaudi2": {
+      "perplexity": 924.062,
+      "train_runtime": 75.518,
+      "train_samples_per_second": 81.097
+    }
+  },
+  "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt-neox-20b_deepspeed": {
+    "gaudi2": {
+      "perplexity": 8.169664686471043,
+      "train_runtime": 445,
+      "train_samples_per_second": 7.328
+    }
+  },
+  "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt2-xl_deepspeed": {
+    "gaudi1": {
+      "perplexity": 12.6744,
+      "train_runtime": 366.8694,
+      "train_samples_per_second": 16.464
+    },
+    "gaudi2": {
+      "perplexity": 13.237754028004865,
+      "train_runtime": 206.5775,
+      "train_samples_per_second": 95.539
+    }
+  },
+  "tests/test_examples.py::DeepspeedSFTExampleTester::test_sft_Qwen2-72B_deepspeed": {
+    "gaudi2": {
+      "perplexity": 3.7020898897918824,
+      "train_runtime": 918.8018,
+      "train_samples_per_second": 7.554
+    }
+  },
+  "tests/test_examples.py::DeepspeedSummarizationExampleTester::test_run_summarization_flan-t5-xxl_deepspeed": {
+    "gaudi2": {
+      "eval_rougeLsum": 29.308,
+      "train_runtime": 155.86,
+      "train_samples_per_second": 28.387
+    }
+  },
+  "tests/test_examples.py::EagerModeCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": {
+    "gaudi2": {
+      "perplexity": 26.69,
+      "train_runtime": 560.8188,
+      "train_samples_per_second": 8.597
+    }
+  },
+  "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_single_card": {
+    "gaudi1": {
+      "eval_accuracy": 0.9871,
+      "train_runtime": 246.4134,
+      "train_samples_per_second": 212.722
+    },
+    "gaudi2": {
+      "eval_accuracy": 0.9850666666666666,
+      "train_runtime": 77.8934,
+      "train_samples_per_second": 826.766
+    }
+  },
+  "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_single_card": {
+    "gaudi1": {
+      "eval_accuracy": 0.9812,
+      "train_runtime": 136.9418,
+      "train_samples_per_second": 359.584
+    },
+    "gaudi2": {
+      "eval_accuracy": 0.9690666666666666,
+      "train_runtime": 54.9734,
+      "train_samples_per_second": 870.272
+    }
+  },
+  "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_ast-finetuned-speech-commands-v2_multi_card": {
+    "gaudi2": {
+      "eval_accuracy": 0.1871,
+      "eval_samples_per_second": 2301.088,
+      "train_runtime": 139.9477,
+      "train_samples_per_second": 1955.74
+    }
+  },
+  "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_wav2vec2-base_multi_card": {
+    "gaudi1": {
+      "eval_accuracy": 0.8013,
+      "eval_samples_per_second": 329.12,
+      "train_runtime": 366.8081,
+      "train_samples_per_second": 716.385
+    },
+    "gaudi2": {
+      "eval_accuracy": 0.7228,
+      "eval_samples_per_second": 3640.021,
+      "train_runtime": 63.4079,
+      "train_samples_per_second": 2975.844
+    }
+  },
+  "tests/test_examples.py::MultiCardBridgetowerExampleTester::test_run_bridgetower_bridgetower-large-itm-mlm-itc_multi_card": {
+    "gaudi2": {
+      "train_runtime": 224.42,
+      "train_samples_per_second": 904.93
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingAdaloraExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 2.59,
+      "train_runtime": 459,
+      "train_samples_per_second": 107
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_multi_card": {
+    "gaudi2": {
+      "perplexity": 954.5995,
+      "train_runtime": 82.6617,
+      "train_samples_per_second": 94.524
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingIA3ExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 3.3,
+      "train_runtime": 262.8,
+      "train_samples_per_second": 161
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_falcon-40b_multi_card": {
+    "gaudi2": {
+      "perplexity": 1.6,
+      "train_runtime": 710,
+      "train_samples_per_second": 15.0
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 2.3665,
+      "train_runtime": 294.5707,
+      "train_samples_per_second": 148.093
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_falcon-40b_multi_card": {
+    "gaudi2": {
+      "perplexity": 4.0,
+      "train_runtime": 550,
+      "train_samples_per_second": 15.0
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi1": {
+      "perplexity": 2.7542,
+      "train_runtime": 538.0159,
+      "train_samples_per_second": 20.397
+    },
+    "gaudi2": {
+      "perplexity": 2.3665,
+      "train_runtime": 294.5707,
+      "train_samples_per_second": 148.093
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLORAFSDPCompileExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 2.4259,
+      "train_runtime": 186.2483,
+      "train_samples_per_second": 93.5
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLlamaAdapterExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 5.575,
+      "train_runtime": 131.7,
+      "train_samples_per_second": 294
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLnExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 2.83,
+      "train_runtime": 249,
+      "train_samples_per_second": 165
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLoRACPExampleTester::test_run_lora_clm_llama-7b_deepspeed": {
+    "gaudi2": {
+      "perplexity": 2.8889,
+      "train_runtime": 147.3597,
+      "train_samples_per_second": 34.41
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingLoRAFP8ExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 2.3692,
+      "train_runtime": 411.9935,
+      "train_samples_per_second": 232.439
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingPTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 1.047,
+      "train_runtime": 18.7,
+      "train_samples_per_second": 63.161
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingPrefixTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 1.172,
+      "train_runtime": 16.1,
+      "train_samples_per_second": 63.249
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingPromptTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 1.224,
+      "train_runtime": 16.5,
+      "train_samples_per_second": 63.161
+    }
+  },
+  "tests/test_examples.py::MultiCardCausalLanguageModelingVeraExampleTester::test_run_lora_clm_llama-7b_multi_card": {
+    "gaudi2": {
+      "perplexity": 9.064502567217577,
+      "train_runtime": 312.9258,
+      "train_samples_per_second": 127.305
+    }
+  },
+  "tests/test_examples.py::MultiCardDPOExampleTester::test_dpo_llama-7b_multi_card": {
+    "gaudi2": {
+      "train_runtime": 234.6471,
+      "train_samples_per_second": 13.499
+    }
+  },
+  "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_multi_card": {
+    "gaudi1": {
+      "eval_accuracy": 0.9819,
+      "train_runtime": 117.6424,
+      "train_samples_per_second": 1683.344
+    },
+    "gaudi2": {
+      "eval_accuracy": 0.9821,
+      "train_runtime": 62.9986,
+      "train_samples_per_second": 6202.525
+    }
+  },
+  "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_multi_card": {
+    "gaudi1": {
+      "eval_accuracy": 0.9803,
+      "train_runtime": 59.972,
+      "train_samples_per_second": 2508.955
+    },
+    "gaudi2": {
+      "eval_accuracy": 0.9679,
+      "train_runtime": 23.99,
+      "train_samples_per_second": 6718.643
+    }
+  },
+  "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_Llama-3.2-11B-Vision-Instruct_multi_card": {
+    "gaudi2": {
+      "eval_accuracy": 0.6,
+      "train_runtime": 350,
+      "train_samples_per_second": 20.48
+    }
+  },
+  "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_idefics2-8b_multi_card": {
+    "gaudi2": {
+      "eval_accuracy": 0.6,
+      "train_runtime": 286,
+      "train_samples_per_second": 11.8
+    }
+  },
+  "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_llava-1.5-7b-hf_multi_card": {
+    "gaudi2": {
+      "eval_accuracy": 0.2122,
+      "train_runtime": 118.5782,
+      "train_samples_per_second": 25.146
+    }
+  },
+  "tests/test_examples.py::MultiCardMaskedLanguageModelingExampleTester::test_run_mlm_roberta-large_multi_card": {
+    "gaudi1": {
+      "perplexity": 2.7851,
+      "train_runtime": 75.0033,
+      "train_samples_per_second": 217.752
+    },
+    "gaudi2": {
+      "perplexity": 2.829522488584474,
+      "train_runtime": 22.7101,
+      "train_samples_per_second": 1056.875
+    }
+  },
+  "tests/test_examples.py::MultiCardPPOExampleTester::test_ppo_llama-7b_multi_card": {
+    "gaudi2": {
+      "train_runtime": 62,
+      "train_samples_per_second": 0.5
+    }
+  },
+  "tests/test_examples.py::MultiCardProteinFoldingClassificationTester::test_run_sequence_classification_protst-esm1b-for-sequential-classification_multi_card": {
+    "gaudi2": {
+      "eval_accuracy": 0.5436668594563332,
+      "train_runtime": 38.9504,
+      "train_samples_per_second": 768.648
+    }
+  },
+  "tests/test_examples.py::MultiCardQuestionAnsweringExampleTester::test_run_qa_roberta-large_multi_card": {
+    "gaudi1": {
+      "eval_f1": 94.2867,
+      "train_runtime": 304.9084,
+      "train_samples_per_second": 366.177
+    },
+    "gaudi2": {
+      "eval_f1": 94.09,
+      "train_runtime": 79.333,
+      "train_samples_per_second": 2138.366
+    }
+  },
+  "tests/test_examples.py::MultiCardRewardExampleTester::test_reward_modeling_llama-7b_multi_card": {
+    "gaudi2": {
+      "train_runtime": 250,
+      "train_samples_per_second": 1.6
+    }
+  },
+  "tests/test_examples.py::MultiCardSFTChatExampleTester::test_sft_Qwen2-7B_multi_card": {
+    "gaudi2": {
+      "train_runtime": 423.995,
+      "train_samples_per_second": 7.342
+    }
+  },
+  "tests/test_examples.py::MultiCardSFTChatPeftExampleTester::test_sft_Qwen2-7B_multi_card": {
+    "gaudi2": {
+      "train_runtime": 410,
+      "train_samples_per_second": 120
+    }
+  },
+  "tests/test_examples.py::MultiCardSFTExampleTester::test_sft_llama-7b_multi_card": {
+    "gaudi2": {
+      "train_runtime": 206,
+      "train_samples_per_second": 51.54
+    }
+  },
+  "tests/test_examples.py::MultiCardSeq2SeqSpeechRecognitionExampleTester::test_run_speech_recognition_seq2seq_whisper-small_multi_card": {
+    "gaudi1": {
+      "eval_samples_per_second": 6.851,
+      "eval_wer": 2.1133,
+      "train_runtime": 551.3249,
+      "train_samples_per_second": 145.59
+    },
+    "gaudi2": {
+      "eval_samples_per_second": 31.0,
+      "eval_wer": 0.4693843594009983,
+      "train_runtime": 380.0,
+      "train_samples_per_second": 218.0
+    }
+  },
+  "tests/test_examples.py::MultiCardSpeechRecognitionExampleTester::test_run_speech_recognition_ctc_wav2vec2-large-lv60_multi_card": {
+    "gaudi1": {
+      "eval_samples_per_second": 54.189,
+      "eval_wer": 0.0496,
+      "train_runtime": 984.3022,
+      "train_samples_per_second": 63.043
+    },
+    "gaudi2": {
+      "eval_samples_per_second": 196.665,
+      "eval_wer": 0.1109,
+      "train_runtime": 308.8036,
+      "train_samples_per_second": 225.572
+    }
+  },
+  "tests/test_examples.py::MultiCardTextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_multi_card": {
+    "gaudi1": {
+      "eval_f1": 0.8897,
+      "train_runtime": 65.644,
+      "train_samples_per_second": 919.623
+    },
+    "gaudi2": {
+      "eval_f1": 0.8452579034941764,
+      "train_runtime": 31.445,
+      "train_samples_per_second": 2845.068
+    }
+  },
+  "tests/test_examples.py::QuestionAnsweringExampleTester::test_run_qa_roberta-large_single_card": {
+    "gaudi1": {
+      "eval_f1": 94.2959,
+      "train_runtime": 1771.3319,
+      "train_samples_per_second": 50.815
+    },
+    "gaudi2": {
+      "eval_f1": 94.5886,
+      "train_runtime": 361.4789,
+      "train_samples_per_second": 266.47
+    }
+  },
+  "tests/test_examples.py::TextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_single_card": {
+    "gaudi1": {
+      "eval_f1": 0.9022,
+      "train_runtime": 90.3943,
+      "train_samples_per_second": 172.792
+    },
+    "gaudi2": {
+      "eval_f1": 0.867,
+      "train_runtime": 33.2909,
+      "train_samples_per_second": 1100.598
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/baselines/CodeLlama_13b_Instruct_hf.json b/tests/configs/examples/CodeLlama_13b_Instruct_hf.json
similarity index 100%
rename from tests/baselines/CodeLlama_13b_Instruct_hf.json
rename to tests/configs/examples/CodeLlama_13b_Instruct_hf.json
diff --git a/tests/baselines/LlamaGuard_7b.json b/tests/configs/examples/LlamaGuard_7b.json
similarity index 100%
rename from tests/baselines/LlamaGuard_7b.json
rename to tests/configs/examples/LlamaGuard_7b.json
diff --git a/tests/baselines/Llama_3_1_8B.json b/tests/configs/examples/Llama_3_1_8B.json
similarity index 100%
rename from tests/baselines/Llama_3_1_8B.json
rename to tests/configs/examples/Llama_3_1_8B.json
diff --git a/tests/baselines/Llama_3_2_11B_Vision_Instruct.json b/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json
similarity index 100%
rename from tests/baselines/Llama_3_2_11B_Vision_Instruct.json
rename to tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json
diff --git a/tests/baselines/Qwen2_72B.json b/tests/configs/examples/Qwen2_72B.json
similarity index 100%
rename from tests/baselines/Qwen2_72B.json
rename to tests/configs/examples/Qwen2_72B.json
diff --git a/tests/baselines/Qwen2_7B.json b/tests/configs/examples/Qwen2_7B.json
similarity index 100%
rename from tests/baselines/Qwen2_7B.json
rename to tests/configs/examples/Qwen2_7B.json
diff --git a/tests/baselines/albert_large_v2.json b/tests/configs/examples/albert_large_v2.json
similarity index 99%
rename from tests/baselines/albert_large_v2.json
rename to tests/configs/examples/albert_large_v2.json
index 2f13722a95..1c4b9a945a 100644
--- a/tests/baselines/albert_large_v2.json
+++ b/tests/configs/examples/albert_large_v2.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "squad": {
             "num_train_epochs": 2,
             "eval_batch_size": 4,
@@ -59,4 +59,4 @@
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/baselines/albert_xxlarge_v1.json b/tests/configs/examples/albert_xxlarge_v1.json
similarity index 99%
rename from tests/baselines/albert_xxlarge_v1.json
rename to tests/configs/examples/albert_xxlarge_v1.json
index 30f4fca526..bed9591e40 100644
--- a/tests/baselines/albert_xxlarge_v1.json
+++ b/tests/configs/examples/albert_xxlarge_v1.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "squad": {
             "num_train_epochs": 1,
             "eval_batch_size": 2,
diff --git a/tests/baselines/ast_finetuned_speech_commands_v2.json b/tests/configs/examples/ast_finetuned_speech_commands_v2.json
similarity index 100%
rename from tests/baselines/ast_finetuned_speech_commands_v2.json
rename to tests/configs/examples/ast_finetuned_speech_commands_v2.json
diff --git a/tests/baselines/bert_base_uncased.json b/tests/configs/examples/bert_base_uncased.json
similarity index 100%
rename from tests/baselines/bert_base_uncased.json
rename to tests/configs/examples/bert_base_uncased.json
diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/configs/examples/bert_large_uncased_whole_word_masking.json
similarity index 99%
rename from tests/baselines/bert_large_uncased_whole_word_masking.json
rename to tests/configs/examples/bert_large_uncased_whole_word_masking.json
index 605e719faf..e90e142262 100755
--- a/tests/baselines/bert_large_uncased_whole_word_masking.json
+++ b/tests/configs/examples/bert_large_uncased_whole_word_masking.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "squad": {
             "num_train_epochs": 1,
             "eval_batch_size": 8,
diff --git a/tests/baselines/bloom_7b1.json b/tests/configs/examples/bloom_7b1.json
similarity index 97%
rename from tests/baselines/bloom_7b1.json
rename to tests/configs/examples/bloom_7b1.json
index 37251e8651..7b71b3a62f 100644
--- a/tests/baselines/bloom_7b1.json
+++ b/tests/configs/examples/bloom_7b1.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "wikitext": {
             "num_train_epochs": 3,
             "eval_batch_size": 4,
diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/configs/examples/bridgetower_large_itm_mlm_itc.json
similarity index 100%
rename from tests/baselines/bridgetower_large_itm_mlm_itc.json
rename to tests/configs/examples/bridgetower_large_itm_mlm_itc.json
diff --git a/tests/baselines/chatglm3_6b.json b/tests/configs/examples/chatglm3_6b.json
similarity index 100%
rename from tests/baselines/chatglm3_6b.json
rename to tests/configs/examples/chatglm3_6b.json
diff --git a/tests/baselines/clip_roberta.json b/tests/configs/examples/clip_roberta.json
similarity index 99%
rename from tests/baselines/clip_roberta.json
rename to tests/configs/examples/clip_roberta.json
index 0c2dfec435..d6f6f9bd89 100755
--- a/tests/baselines/clip_roberta.json
+++ b/tests/configs/examples/clip_roberta.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "ydshieh/coco_dataset_script": {
             "num_train_epochs": 1,
             "eval_batch_size": 64,
diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/configs/examples/distilbert_base_uncased.json
similarity index 99%
rename from tests/baselines/distilbert_base_uncased.json
rename to tests/configs/examples/distilbert_base_uncased.json
index 8678342e7b..a53c764cc0 100644
--- a/tests/baselines/distilbert_base_uncased.json
+++ b/tests/configs/examples/distilbert_base_uncased.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "squad": {
             "num_train_epochs": 1,
             "eval_batch_size": 8,
diff --git a/tests/baselines/falcon_40b.json b/tests/configs/examples/falcon_40b.json
similarity index 100%
rename from tests/baselines/falcon_40b.json
rename to tests/configs/examples/falcon_40b.json
diff --git a/tests/baselines/flan_t5_xxl.json b/tests/configs/examples/flan_t5_xxl.json
similarity index 100%
rename from tests/baselines/flan_t5_xxl.json
rename to tests/configs/examples/flan_t5_xxl.json
diff --git a/tests/baselines/gemma_2b_it.json b/tests/configs/examples/gemma_2b_it.json
similarity index 100%
rename from tests/baselines/gemma_2b_it.json
rename to tests/configs/examples/gemma_2b_it.json
diff --git a/tests/baselines/gemma_2b_it_eager.json b/tests/configs/examples/gemma_2b_it_eager.json
similarity index 100%
rename from tests/baselines/gemma_2b_it_eager.json
rename to tests/configs/examples/gemma_2b_it_eager.json
diff --git a/tests/baselines/gpt2.json b/tests/configs/examples/gpt2.json
similarity index 99%
rename from tests/baselines/gpt2.json
rename to tests/configs/examples/gpt2.json
index f293e9325c..4c9e89344e 100644
--- a/tests/baselines/gpt2.json
+++ b/tests/configs/examples/gpt2.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "wikitext": {
             "num_train_epochs": 2,
             "eval_batch_size": 4,
diff --git a/tests/baselines/gpt2_xl.json b/tests/configs/examples/gpt2_xl.json
similarity index 98%
rename from tests/baselines/gpt2_xl.json
rename to tests/configs/examples/gpt2_xl.json
index 68651d16e3..1235d46bfe 100644
--- a/tests/baselines/gpt2_xl.json
+++ b/tests/configs/examples/gpt2_xl.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "wikitext": {
             "num_train_epochs": 2,
             "eval_batch_size": 4,
diff --git a/tests/baselines/gpt_neox_20b.json b/tests/configs/examples/gpt_neox_20b.json
similarity index 100%
rename from tests/baselines/gpt_neox_20b.json
rename to tests/configs/examples/gpt_neox_20b.json
diff --git a/tests/baselines/idefics2_8b.json b/tests/configs/examples/idefics2_8b.json
similarity index 100%
rename from tests/baselines/idefics2_8b.json
rename to tests/configs/examples/idefics2_8b.json
diff --git a/tests/baselines/llama_7b.json b/tests/configs/examples/llama_7b.json
similarity index 99%
rename from tests/baselines/llama_7b.json
rename to tests/configs/examples/llama_7b.json
index dcfd6d3807..7ce34edaee 100644
--- a/tests/baselines/llama_7b.json
+++ b/tests/configs/examples/llama_7b.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "databricks/databricks-dolly-15k": {
             "num_train_epochs": 1,
             "eval_batch_size": 2,
diff --git a/tests/baselines/llava_1_5_7b_hf.json b/tests/configs/examples/llava_1_5_7b_hf.json
similarity index 100%
rename from tests/baselines/llava_1_5_7b_hf.json
rename to tests/configs/examples/llava_1_5_7b_hf.json
diff --git a/tests/baselines/protst_esm1b_for_sequential_classification.json b/tests/configs/examples/protst_esm1b_for_sequential_classification.json
similarity index 100%
rename from tests/baselines/protst_esm1b_for_sequential_classification.json
rename to tests/configs/examples/protst_esm1b_for_sequential_classification.json
diff --git a/tests/baselines/roberta_base.json b/tests/configs/examples/roberta_base.json
similarity index 99%
rename from tests/baselines/roberta_base.json
rename to tests/configs/examples/roberta_base.json
index 1c196fce1b..affe106e5f 100644
--- a/tests/baselines/roberta_base.json
+++ b/tests/configs/examples/roberta_base.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "squad": {
             "num_train_epochs": 1,
             "eval_batch_size": 8,
diff --git a/tests/baselines/roberta_large.json b/tests/configs/examples/roberta_large.json
similarity index 99%
rename from tests/baselines/roberta_large.json
rename to tests/configs/examples/roberta_large.json
index 4d7233e089..111dfa4533 100755
--- a/tests/baselines/roberta_large.json
+++ b/tests/configs/examples/roberta_large.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "squad": {
             "num_train_epochs": 1,
             "eval_batch_size": 8,
diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json
similarity index 99%
rename from tests/baselines/swin_base_patch4_window7_224_in22k.json
rename to tests/configs/examples/swin_base_patch4_window7_224_in22k.json
index 8e0a5c40c3..27efbd30a2 100644
--- a/tests/baselines/swin_base_patch4_window7_224_in22k.json
+++ b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "cifar10": {
             "num_train_epochs": 1,
             "eval_batch_size": 64,
diff --git a/tests/baselines/t5_small.json b/tests/configs/examples/t5_small.json
similarity index 99%
rename from tests/baselines/t5_small.json
rename to tests/configs/examples/t5_small.json
index 31f9c80ef6..b6d058a510 100644
--- a/tests/baselines/t5_small.json
+++ b/tests/configs/examples/t5_small.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "cnn_dailymail": {
             "num_train_epochs": 1,
             "eval_batch_size": 4,
diff --git a/tests/baselines/vit_base_patch16_224_in21k.json b/tests/configs/examples/vit_base_patch16_224_in21k.json
similarity index 99%
rename from tests/baselines/vit_base_patch16_224_in21k.json
rename to tests/configs/examples/vit_base_patch16_224_in21k.json
index 7b27afd29a..679640cf64 100644
--- a/tests/baselines/vit_base_patch16_224_in21k.json
+++ b/tests/configs/examples/vit_base_patch16_224_in21k.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "cifar10": {
             "num_train_epochs": 1,
             "eval_batch_size": 64,
diff --git a/tests/baselines/wav2vec2_base.json b/tests/configs/examples/wav2vec2_base.json
similarity index 99%
rename from tests/baselines/wav2vec2_base.json
rename to tests/configs/examples/wav2vec2_base.json
index b187e02d51..a4f76a5a39 100644
--- a/tests/baselines/wav2vec2_base.json
+++ b/tests/configs/examples/wav2vec2_base.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "common_language": {
             "num_train_epochs": 10,
             "eval_batch_size": 64,
diff --git a/tests/baselines/wav2vec2_large_lv60.json b/tests/configs/examples/wav2vec2_large_lv60.json
similarity index 99%
rename from tests/baselines/wav2vec2_large_lv60.json
rename to tests/configs/examples/wav2vec2_large_lv60.json
index 920239618b..862122ebb0 100644
--- a/tests/baselines/wav2vec2_large_lv60.json
+++ b/tests/configs/examples/wav2vec2_large_lv60.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "regisss/librispeech_asr_for_optimum_habana_ci": {
             "num_train_epochs": 2,
             "eval_batch_size": 8,
diff --git a/tests/baselines/whisper_small.json b/tests/configs/examples/whisper_small.json
similarity index 99%
rename from tests/baselines/whisper_small.json
rename to tests/configs/examples/whisper_small.json
index 055d321152..fac096950e 100644
--- a/tests/baselines/whisper_small.json
+++ b/tests/configs/examples/whisper_small.json
@@ -1,5 +1,5 @@
 {
-    "gaudi": {
+    "gaudi1": {
         "mozilla-foundation/common_voice_11_0": {
             "num_train_epochs": 10,
             "eval_batch_size": 2,
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 578ba7825e..eabfc2bc57 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -14,7 +14,10 @@
 # limitations under the License.
 
 import json
+import logging
+import operator
 import os
+import pytest
 import re
 import subprocess
 from distutils.util import strtobool
@@ -54,7 +57,7 @@
 )
 
 
-BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("baselines")
+CONFIG_DIRECTORY = Path(__file__).parent.resolve() / Path("configs") / Path("examples")
 # Models should reach at least 99% of their baseline accuracy
 ACCURACY_PERF_FACTOR = 0.99
 # Trainings/Evaluations should last at most 5% longer than the baseline
@@ -62,7 +65,7 @@
 
 
 IS_GAUDI2 = bool("gaudi2" == OH_DEVICE_CONTEXT)
-
+IS_GAUDI1 = bool("gaudi1" == OH_DEVICE_CONTEXT)
 
 def _get_supported_models_for_script(
     models_to_test: Dict[str, List[Tuple[str]]],
@@ -454,29 +457,28 @@ def test(self):
 
             self._install_requirements(example_script.parent / "requirements.txt")
 
-            # collect baseline from <model_name>_eager.json if eager_mode is True
+            # collect test_config from <model_name>_eager.json if eager_mode is True
             if self.EAGER_MODE:
-                baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
+                config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
             else:
-                baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
+                config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
 
-            path_to_baseline = BASELINE_DIRECTORY / Path(baseline_name).with_suffix(".json")
+            path_to_config = CONFIG_DIRECTORY / Path(config_name).with_suffix(".json")
 
-            with path_to_baseline.open("r") as json_file:
-                device = "gaudi2" if IS_GAUDI2 else "gaudi"
-                baseline = json.load(json_file)[device]
+            with path_to_config.open("r") as json_file:
+                test_config = json.load(json_file)[OH_DEVICE_CONTEXT]
                 if isinstance(self.TASK_NAME, list):
                     for key in self.TASK_NAME:
-                        if key in baseline:
-                            baseline = baseline[key]
+                        if key in test_config:
+                            test_config = test_config[key]
                             break
-                    if "num_train_epochs" not in baseline:
+                    if "num_train_epochs" not in test_config:
                         raise ValueError(
-                            f"Couldn't find a baseline associated to any of these tasks: {self.TASK_NAME}."
+                            f"Couldn't find a test config associated to any of these tasks: {self.TASK_NAME}."
                         )
                     self.TASK_NAME = key
                 else:
-                    baseline = baseline[self.TASK_NAME]
+                    test_config = test_config[self.TASK_NAME]
 
             distribution = "single_card"
             if multi_card:
@@ -507,7 +509,7 @@ def test(self):
             if fp8 and "llama" in model_name:
                 env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt")
 
-            extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
+            extra_command_line_arguments = test_config.get("distribution").get(distribution).get("extra_arguments", [])
 
             if self.EAGER_MODE:
                 env_variables["PT_HPU_LAZY_MODE"] = "0"
@@ -569,10 +571,10 @@ def test(self):
                     gaudi_config_name,
                     tmp_dir,
                     task=self.TASK_NAME,
-                    lr=baseline.get("distribution").get(distribution).get("learning_rate"),
-                    train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"),
-                    eval_batch_size=baseline.get("eval_batch_size"),
-                    num_epochs=baseline.get("num_train_epochs"),
+                    lr=test_config.get("distribution").get(distribution).get("learning_rate"),
+                    train_batch_size=test_config.get("distribution").get(distribution).get("train_batch_size"),
+                    eval_batch_size=test_config.get("eval_batch_size"),
+                    num_epochs=test_config.get("num_train_epochs"),
                     extra_command_line_arguments=extra_command_line_arguments,
                 )
                 print(f"\n\nCommand to test: {' '.join(cmd_line[:])}\n")
@@ -585,7 +587,7 @@ def test(self):
                 with open(Path(tmp_dir) / "all_results.json") as fp:
                     results = json.load(fp)
                 # Ensure performance requirements (accuracy, training time) are met
-                self.assert_no_regression(results, baseline.get("distribution").get(distribution), model_name)
+                self.assert_no_regression(results, test_config.get("distribution").get(distribution).get("metrics"), model_name)
 
             # TODO: is a cleanup of the dataset cache needed?
             # self._cleanup_dataset_cache()
@@ -612,17 +614,24 @@ class ExampleTesterBase(TestCase):
     DATASET_PARAMETER_NAME = "dataset_name"
     DATASET_NAME = None
     REGRESSION_METRICS = {
-        "eval_f1": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-        "eval_accuracy": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-        "perplexity": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-        "eval_rougeLsum": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-        "train_runtime": (TestCase.assertLessEqual, TIME_PERF_FACTOR),
-        "eval_wer": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-        "train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
-        "eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
+        "eval_f1": (operator.ge, ACCURACY_PERF_FACTOR),
+        "eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),
+        "perplexity": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+        "eval_rougeLsum": (operator.ge, ACCURACY_PERF_FACTOR),
+        "train_runtime": (operator.le, TIME_PERF_FACTOR),
+        "eval_wer": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+        "train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
+        "eval_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
     }
     EAGER_MODE = False
 
+    @pytest.fixture(autouse=True)
+    def _use_(self, baseline):
+        """Store the pytest `baseline` fixture on the instance so test methods can use `self.baseline`.
+        See https://docs.pytest.org/en/stable/how-to/unittest.html#using-autouse-fixtures-and-accessing-other-fixtures
+        """
+        self.baseline = baseline
+
     def _create_command_line(
         self,
         multi_card: bool,
@@ -717,7 +726,7 @@ def _install_requirements(self, requirements_filename: Union[str, os.PathLike]):
         return_code = p.wait()
         self.assertEqual(return_code, 0)
 
-    def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
+    def assert_no_regression(self, results: Dict, metrics: list, model_name: str):
         """
         Assert whether all possible performance requirements are met.
         Attributes:
@@ -725,12 +734,12 @@ def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
-            baseline (Dict): baseline to assert whether or not there is regression
+            metrics (list): names of the metrics to assess for regression
         """
         # Gather all the metrics to assess
-        metrics_to_assess = []
-        for metric_name in self.REGRESSION_METRICS.keys():
-            if metric_name in baseline and metric_name in results:
-                metrics_to_assess.append(metric_name)
-        # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
+        # Follow the REGRESSION_METRICS order so metrics are assessed and logged in the same order on every run;
+        # a missing `metrics` entry (None) counts as empty so the metric-count check below reports it
+        requested = set(metrics or ())
+        metrics_to_assess = [name for name in self.REGRESSION_METRICS if name in requested and name in results]
         min_number_metrics = 3
+
+        # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
         if (
             self.EXAMPLE_NAME in ["run_clip", "run_bridgetower", "sft", "dpo", "ppo", "reward_modeling"]
             or "bloom" in model_name
@@ -745,25 +752,33 @@ def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
             (
                 f"{len(metrics_to_assess)} asserted metric(s) while at least 3 are expected (throughput + training"
                 f" time + accuracy). Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received:"
-                f" {baseline.keys()}"
+                f" {metrics}"
             ),
         )
 
-        # Message to display if one test fails
-        # This enables to show all the results and baselines even if one test fails before others
-        failure_message = "\n===== Assessed metrics (measured vs thresholded baseline) =====\n"
-        for metric_name in metrics_to_assess:
-            failure_message += f"{metric_name}: {results[metric_name]} vs {self.REGRESSION_METRICS[metric_name][1] * baseline[metric_name]}\n"
-
         # Assess metrics
+        # Every metric is checked before failing so that a single run reports all regressions
+        passed = True
         for metric_name in metrics_to_assess:
-            assert_function, threshold_factor = self.REGRESSION_METRICS[metric_name]
-            assert_function(
-                self,
-                results[metric_name],
-                threshold_factor * baseline[metric_name],
-                msg=f"for metric {metric_name}. {failure_message}",
-            )
+            fn, threshold = self.REGRESSION_METRICS[metric_name]
+
+            def check(actual, ref):
+                # Record the comparison on the function itself so it can be logged if the check fails
+                check.msg = f"{metric_name}: {fn.__name__}({actual}, {threshold} * {ref})\n"
+                return fn(actual, threshold * ref)
+
+            check.msg = ""
+
+            try:
+                self.baseline.assertRef(
+                    compare=check, context=[OH_DEVICE_CONTEXT], **{metric_name: results[metric_name]}
+                )
+            except Exception as e:
+                # Log the exception too: `check.msg` is still empty when the failure happens before `check` runs
+                logging.getLogger().error("%s%s: %s", check.msg, type(e).__name__, e)
+                passed = False
+
+        self.assertTrue(passed, "One or more metrics failed")
 
 
 class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue"):