diff --git a/tests/baselines/fixture/tests/test_examples.json b/tests/baselines/fixture/tests/test_examples.json new file mode 100644 index 0000000000..d1820727f8 --- /dev/null +++ b/tests/baselines/fixture/tests/test_examples.json @@ -0,0 +1,440 @@ +{ + "tests/test_examples.py::CausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": { + "gaudi2": { + "perplexity": 26.39, + "train_runtime": 356.07, + "train_samples_per_second": 14.06 + } + }, + "tests/test_examples.py::CausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_single_card": { + "gaudi1": { + "perplexity": 3.9168, + "train_runtime": 132.665, + "train_samples_per_second": 2.295 + }, + "gaudi2": { + "perplexity": 3.8436, + "train_runtime": 113.9713, + "train_samples_per_second": 18.428 + } + }, + "tests/test_examples.py::DeepSpeedTextClassificationExampleTester::test_run_glue_LlamaGuard-7b_deepspeed": { + "gaudi2": { + "eval_f1": 0.8873483535528596, + "train_runtime": 62.4539, + "train_samples_per_second": 342.169 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_CodeLlama-13b-Instruct-hf_deepspeed": { + "gaudi2": { + "perplexity": 6.877496628184696, + "train_runtime": 542.2985, + "train_samples_per_second": 18.789 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_bloom-7b1_deepspeed": { + "gaudi1": { + "train_runtime": 1556.481, + "train_samples_per_second": 4.757 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_chatglm3-6b_deepspeed": { + "gaudi2": { + "perplexity": 16.51629, + "train_runtime": 445, + "train_samples_per_second": 18.216 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_deepspeed": { + "gaudi2": { + "perplexity": 924.062, + "train_runtime": 75.518, + "train_samples_per_second": 81.097 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt-neox-20b_deepspeed": { + "gaudi2": { + "perplexity": 8.169664686471043, + "train_runtime": 445, + "train_samples_per_second": 7.328 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt2-xl_deepspeed": { + "gaudi1": { + "perplexity": 12.6744, + "train_runtime": 366.8694, + "train_samples_per_second": 16.464 + }, + "gaudi2": { + "perplexity": 13.237754028004865, + "train_runtime": 206.5775, + "train_samples_per_second": 95.539 + } + }, + "tests/test_examples.py::DeepspeedSFTExampleTester::test_sft_Qwen2-72B_deepspeed": { + "gaudi2": { + "perplexity": 3.7020898897918824, + "train_runtime": 918.8018, + "train_samples_per_second": 7.554 + } + }, + "tests/test_examples.py::DeepspeedSummarizationExampleTester::test_run_summarization_flan-t5-xxl_deepspeed": { + "gaudi2": { + "eval_rougeLsum": 29.308, + "train_runtime": 155.86, + "train_samples_per_second": 28.387 + } + }, + "tests/test_examples.py::EagerModeCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": { + "gaudi2": { + "perplexity": 26.69, + "train_runtime": 560.8188, + "train_samples_per_second": 8.597 + } + }, + "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_single_card": { + "gaudi1": { + "eval_accuracy": 0.9871, + "train_runtime": 246.4134, + "train_samples_per_second": 212.722 + }, + "gaudi2": { + "eval_accuracy": 0.9850666666666666, + "train_runtime": 77.8934, + "train_samples_per_second": 826.766 + } + }, + 
"tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_single_card": { + "gaudi1": { + "eval_accuracy": 0.9812, + "train_runtime": 136.9418, + "train_samples_per_second": 359.584 + }, + "gaudi2": { + "eval_accuracy": 0.9690666666666666, + "train_runtime": 54.9734, + "train_samples_per_second": 870.272 + } + }, + "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_ast-finetuned-speech-commands-v2_multi_card": { + "gaudi2": { + "eval_accuracy": 0.1871, + "eval_samples_per_second": 2301.088, + "train_runtime": 139.9477, + "train_samples_per_second": 1955.74 + } + }, + "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_wav2vec2-base_multi_card": { + "gaudi1": { + "eval_accuracy": 0.8013, + "eval_samples_per_second": 329.12, + "train_runtime": 366.8081, + "train_samples_per_second": 716.385 + }, + "gaudi2": { + "eval_accuracy": 0.7228, + "eval_samples_per_second": 3640.021, + "train_runtime": 63.4079, + "train_samples_per_second": 2975.844 + } + }, + "tests/test_examples.py::MultiCardBridgetowerExampleTester::test_run_bridgetower_bridgetower-large-itm-mlm-itc_multi_card": { + "gaudi2": { + "train_runtime": 224.42, + "train_samples_per_second": 904.93 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingAdaloraExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.59, + "train_runtime": 459, + "train_samples_per_second": 107 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_multi_card": { + "gaudi2": { + "perplexity": 954.5995, + "train_runtime": 82.6617, + "train_samples_per_second": 94.524 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingIA3ExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 3.3, + "train_runtime": 262.8, + "train_samples_per_second": 161 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_falcon-40b_multi_card": { + "gaudi2": { + "perplexity": 1.6, + "train_runtime": 710, + "train_samples_per_second": 15.0 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_falcon-40b_multi_card": { + "gaudi2": { + "perplexity": 4.0, + "train_runtime": 550, + "train_samples_per_second": 15.0 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi1": { + "perplexity": 2.7542, + "train_runtime": 538.0159, + "train_samples_per_second": 20.397 + }, + "gaudi2": { + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAFSDPCompileExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.4259, + "train_runtime": 186.2483, + "train_samples_per_second": 93.5 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLlamaAdapterExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 5.575, + "train_runtime": 131.7, + "train_samples_per_second": 294 + } + }, + 
"tests/test_examples.py::MultiCardCausalLanguageModelingLnExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.83, + "train_runtime": 249, + "train_samples_per_second": 165 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLoRACPExampleTester::test_run_lora_clm_llama-7b_deepspeed": { + "gaudi2": { + "perplexity": 2.8889, + "train_runtime": 147.3597, + "train_samples_per_second": 34.41 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLoRAFP8ExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.3692, + "train_runtime": 411.9935, + "train_samples_per_second": 232.439 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingPTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 1.047, + "train_runtime": 18.7, + "train_samples_per_second": 63.161 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingPrefixTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 1.172, + "train_runtime": 16.1, + "train_samples_per_second": 63.249 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingPromptTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 1.224, + "train_runtime": 16.5, + "train_samples_per_second": 63.161 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingVeraExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 9.064502567217577, + "train_runtime": 312.9258, + "train_samples_per_second": 127.305 + } + }, + "tests/test_examples.py::MultiCardDPOExampleTester::test_dpo_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 234.6471, + "train_samples_per_second": 13.499 + } + }, + "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_multi_card": { + "gaudi1": { + "eval_accuracy": 0.9819, + "train_runtime": 117.6424, + "train_samples_per_second": 1683.344 + }, + "gaudi2": { + "eval_accuracy": 0.9821, + "train_runtime": 62.9986, + "train_samples_per_second": 6202.525 + } + }, + "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_multi_card": { + "gaudi1": { + "eval_accuracy": 0.9803, + "train_runtime": 59.972, + "train_samples_per_second": 2508.955 + }, + "gaudi2": { + "eval_accuracy": 0.9679, + "train_runtime": 23.99, + "train_samples_per_second": 6718.643 + } + }, + "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_Llama-3.2-11B-Vision-Instruct_multi_card": { + "gaudi2": { + "eval_accuracy": 0.6, + "train_runtime": 350, + "train_samples_per_second": 20.48 + } + }, + "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_idefics2-8b_multi_card": { + "gaudi2": { + "eval_accuracy": 0.6, + "train_runtime": 286, + "train_samples_per_second": 11.8 + } + }, + "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_llava-1.5-7b-hf_multi_card": { + "gaudi2": { + "eval_accuracy": 0.2122, + "train_runtime": 118.5782, + "train_samples_per_second": 25.146 + } + }, + "tests/test_examples.py::MultiCardMaskedLanguageModelingExampleTester::test_run_mlm_roberta-large_multi_card": { + "gaudi1": { + "perplexity": 2.7851, + "train_runtime": 75.0033, + "train_samples_per_second": 
217.752 + }, + "gaudi2": { + "perplexity": 2.829522488584474, + "train_runtime": 22.7101, + "train_samples_per_second": 1056.875 + } + }, + "tests/test_examples.py::MultiCardPPOExampleTester::test_ppo_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 62, + "train_samples_per_second": 0.5 + } + }, + "tests/test_examples.py::MultiCardProteinFoldingClassificationTester::test_run_sequence_classification_protst-esm1b-for-sequential-classification_multi_card": { + "gaudi2": { + "eval_accuracy": 0.5436668594563332, + "train_runtime": 38.9504, + "train_samples_per_second": 768.648 + } + }, + "tests/test_examples.py::MultiCardQuestionAnsweringExampleTester::test_run_qa_roberta-large_multi_card": { + "gaudi1": { + "eval_f1": 94.2867, + "train_runtime": 304.9084, + "train_samples_per_second": 366.177 + }, + "gaudi2": { + "eval_f1": 94.09, + "train_runtime": 79.333, + "train_samples_per_second": 2138.366 + } + }, + "tests/test_examples.py::MultiCardRewardExampleTester::test_reward_modeling_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 250, + "train_samples_per_second": 1.6 + } + }, + "tests/test_examples.py::MultiCardSFTChatExampleTester::test_sft_Qwen2-7B_multi_card": { + "gaudi2": { + "train_runtime": 423.995, + "train_samples_per_second": 7.342 + } + }, + "tests/test_examples.py::MultiCardSFTChatPeftExampleTester::test_sft_Qwen2-7B_multi_card": { + "gaudi2": { + "train_runtime": 410, + "train_samples_per_second": 120 + } + }, + "tests/test_examples.py::MultiCardSFTExampleTester::test_sft_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 206, + "train_samples_per_second": 51.54 + } + }, + "tests/test_examples.py::MultiCardSeq2SeqSpeechRecognitionExampleTester::test_run_speech_recognition_seq2seq_whisper-small_multi_card": { + "gaudi1": { + "eval_samples_per_second": 6.851, + "eval_wer": 2.1133, + "train_runtime": 551.3249, + "train_samples_per_second": 145.59 + }, + "gaudi2": { + "eval_samples_per_second": 31.0, + "eval_wer": 0.4693843594009983, + "train_runtime": 380.0, + "train_samples_per_second": 218.0 + } + }, + "tests/test_examples.py::MultiCardSpeechRecognitionExampleTester::test_run_speech_recognition_ctc_wav2vec2-large-lv60_multi_card": { + "gaudi1": { + "eval_samples_per_second": 54.189, + "eval_wer": 0.0496, + "train_runtime": 984.3022, + "train_samples_per_second": 63.043 + }, + "gaudi2": { + "eval_samples_per_second": 196.665, + "eval_wer": 0.1109, + "train_runtime": 308.8036, + "train_samples_per_second": 225.572 + } + }, + "tests/test_examples.py::MultiCardTextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_multi_card": { + "gaudi1": { + "eval_f1": 0.8897, + "train_runtime": 65.644, + "train_samples_per_second": 919.623 + }, + "gaudi2": { + "eval_f1": 0.8452579034941764, + "train_runtime": 31.445, + "train_samples_per_second": 2845.068 + } + }, + "tests/test_examples.py::QuestionAnsweringExampleTester::test_run_qa_roberta-large_single_card": { + "gaudi1": { + "eval_f1": 94.2959, + "train_runtime": 1771.3319, + "train_samples_per_second": 50.815 + }, + "gaudi2": { + "eval_f1": 94.5886, + "train_runtime": 361.4789, + "train_samples_per_second": 266.47 + } + }, + "tests/test_examples.py::TextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_single_card": { + "gaudi1": { + "eval_f1": 0.9022, + "train_runtime": 90.3943, + "train_samples_per_second": 172.792 + }, + "gaudi2": { + "eval_f1": 0.867, + "train_runtime": 33.2909, + "train_samples_per_second": 1100.598 + } + } +} \ No newline at end of file 
diff --git a/tests/baselines/CodeLlama_13b_Instruct_hf.json b/tests/configs/examples/CodeLlama_13b_Instruct_hf.json similarity index 79% rename from tests/baselines/CodeLlama_13b_Instruct_hf.json rename to tests/configs/examples/CodeLlama_13b_Instruct_hf.json index 52af5445cb..d2c2aa86f9 100644 --- a/tests/baselines/CodeLlama_13b_Instruct_hf.json +++ b/tests/configs/examples/CodeLlama_13b_Instruct_hf.json @@ -7,9 +7,7 @@ "deepspeed": { "learning_rate": 5e-5, "train_batch_size": 48, - "train_runtime": 542.2985, - "train_samples_per_second": 18.789, - "perplexity": 6.877496628184696, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", @@ -20,4 +18,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/LlamaGuard_7b.json b/tests/configs/examples/LlamaGuard_7b.json similarity index 79% rename from tests/baselines/LlamaGuard_7b.json rename to tests/configs/examples/LlamaGuard_7b.json index a1e5bc7db5..704fe64c73 100644 --- a/tests/baselines/LlamaGuard_7b.json +++ b/tests/configs/examples/LlamaGuard_7b.json @@ -7,9 +7,7 @@ "deepspeed": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 0.8873483535528596, - "train_runtime": 62.4539, - "train_samples_per_second": 342.169, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--add_pad_token True", @@ -20,4 +18,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/Llama_3_1_8B.json b/tests/configs/examples/Llama_3_1_8B.json similarity index 88% rename from tests/baselines/Llama_3_1_8B.json rename to tests/configs/examples/Llama_3_1_8B.json index fa7b39c095..4c57db9a6b 100644 --- a/tests/baselines/Llama_3_1_8B.json +++ b/tests/configs/examples/Llama_3_1_8B.json @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 3e-4, "train_batch_size": 10, - "perplexity": 2.7317, - "train_runtime": 1435.24322, - "train_samples_per_second": 13.3044, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_checkpointing", @@ -32,4 +30,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/Llama_3_2_11B_Vision_Instruct.json b/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json similarity index 90% rename from tests/baselines/Llama_3_2_11B_Vision_Instruct.json rename to tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json index fd90ab97f0..fd8abaccfc 100644 --- a/tests/baselines/Llama_3_2_11B_Vision_Instruct.json +++ b/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json @@ -7,9 +7,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 2, - "train_runtime": 350, - "train_samples_per_second": 20.48, - "eval_accuracy": 0.6, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 8", diff --git a/tests/baselines/Qwen2_72B.json b/tests/configs/examples/Qwen2_72B.json similarity index 91% rename from tests/baselines/Qwen2_72B.json rename to tests/configs/examples/Qwen2_72B.json index 5bc4923a77..848bb0238d 100644 --- a/tests/baselines/Qwen2_72B.json +++ b/tests/configs/examples/Qwen2_72B.json @@ -7,9 +7,7 @@ "deepspeed": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 3.7020898897918824, - "train_runtime": 918.8018, - "train_samples_per_second": 7.554, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ 
"--bf16 True", "--subset None", @@ -44,4 +42,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/Qwen2_7B.json b/tests/configs/examples/Qwen2_7B.json similarity index 93% rename from tests/baselines/Qwen2_7B.json rename to tests/configs/examples/Qwen2_7B.json index d98abf5e7d..23b4ea048a 100644 --- a/tests/baselines/Qwen2_7B.json +++ b/tests/configs/examples/Qwen2_7B.json @@ -7,8 +7,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 32, - "train_runtime": 410, - "train_samples_per_second": 120, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16 True", "--subset ''", @@ -44,8 +43,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 2, - "train_runtime": 423.995, - "train_samples_per_second": 7.342, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16 True", "--subset ''", @@ -71,4 +69,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/albert_large_v2.json b/tests/configs/examples/albert_large_v2.json similarity index 72% rename from tests/baselines/albert_large_v2.json rename to tests/configs/examples/albert_large_v2.json index 2f13722a95..59648caf3e 100644 --- a/tests/baselines/albert_large_v2.json +++ b/tests/configs/examples/albert_large_v2.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 2, "eval_batch_size": 4, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 32, - "eval_f1": 91.8679, - "train_runtime": 2900.5518, - "train_samples_per_second": 62.298, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ "multi_card": { "learning_rate": 6e-5, "train_batch_size": 32, - "eval_f1": 92.7647, - "train_runtime": 464.9893, - "train_samples_per_second": 494.936, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -37,9 +33,7 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 128, - "eval_f1": 92.4235, - "train_runtime": 571.138, - "train_samples_per_second": 321.635, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +42,7 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 128, - "eval_f1": 92.2111, - "train_runtime": 115.15, - "train_samples_per_second": 2464.403, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -59,4 +51,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/albert_xxlarge_v1.json b/tests/configs/examples/albert_xxlarge_v1.json similarity index 72% rename from tests/baselines/albert_xxlarge_v1.json rename to tests/configs/examples/albert_xxlarge_v1.json index 30f4fca526..ebda527c92 100644 --- a/tests/baselines/albert_xxlarge_v1.json +++ b/tests/configs/examples/albert_xxlarge_v1.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 2, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 1e-5, "train_batch_size": 12, - "eval_f1": 95.1334, - "train_runtime": 9474.1784, - "train_samples_per_second": 9.464, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ 
"multi_card": { "learning_rate": 5e-5, "train_batch_size": 12, - "eval_f1": 95.1145, - "train_runtime": 1347.7824, - "train_samples_per_second": 71.285, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -37,9 +33,7 @@ "single_card": { "learning_rate": 2e-5, "train_batch_size": 16, - "eval_f1": 95.1484, - "train_runtime": 1523.3401, - "train_samples_per_second": 58.697, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +42,7 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 16, - "eval_f1": 95.1898, - "train_runtime": 243.0459, - "train_samples_per_second": 416.256, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -59,4 +51,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/ast_finetuned_speech_commands_v2.json b/tests/configs/examples/ast_finetuned_speech_commands_v2.json similarity index 83% rename from tests/baselines/ast_finetuned_speech_commands_v2.json rename to tests/configs/examples/ast_finetuned_speech_commands_v2.json index f518c92907..b9c347222b 100644 --- a/tests/baselines/ast_finetuned_speech_commands_v2.json +++ b/tests/configs/examples/ast_finetuned_speech_commands_v2.json @@ -7,10 +7,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 32, - "eval_accuracy": 0.1871, - "train_runtime": 139.9477, - "train_samples_per_second": 1955.74, - "eval_samples_per_second": 2301.088, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second", "eval_samples_per_second"], "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", diff --git a/tests/baselines/bert_base_uncased.json b/tests/configs/examples/bert_base_uncased.json similarity index 71% rename from tests/baselines/bert_base_uncased.json rename to tests/configs/examples/bert_base_uncased.json index 18bcf59170..1960b5272a 100644 --- a/tests/baselines/bert_base_uncased.json +++ b/tests/configs/examples/bert_base_uncased.json @@ -6,9 +6,7 @@ "single_card": { "learning_rate": 5e-5, "train_batch_size": 24, - "eval_f1": 87.3749, - "train_runtime": 568.832, - "train_samples_per_second": 158.687, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -17,9 +15,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 24, - "eval_f1": 87.6017, - "train_runtime": 97.7157, - "train_samples_per_second": 1240.638, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -34,9 +30,7 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 64, - "eval_f1": 0.8998, - "train_runtime": 31.044, - "train_samples_per_second": 558.201, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -45,9 +39,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 64, - "eval_f1": 0.8765, - "train_runtime": 28.3865, - "train_samples_per_second": 3643.715, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -55,4 +47,4 @@ } } } -} \ No newline at end of file +} diff 
--git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/configs/examples/bert_large_uncased_whole_word_masking.json similarity index 72% rename from tests/baselines/bert_large_uncased_whole_word_masking.json rename to tests/configs/examples/bert_large_uncased_whole_word_masking.json index 605e719faf..f3a0d79692 100755 --- a/tests/baselines/bert_large_uncased_whole_word_masking.json +++ b/tests/configs/examples/bert_large_uncased_whole_word_masking.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 24, - "eval_f1": 93.1962, - "train_runtime": 1678.3456, - "train_samples_per_second": 54.101, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 24, - "eval_f1": 93.1869, - "train_runtime": 309.9553, - "train_samples_per_second": 398.459, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -35,9 +31,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 0.9022, - "train_runtime": 90.3943, - "train_samples_per_second": 172.792, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -46,9 +40,7 @@ "multi_card": { "learning_rate": 3e-5, "train_batch_size": 16, - "eval_f1": 0.8897, - "train_runtime": 65.644, - "train_samples_per_second": 919.623, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -65,9 +57,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 93.2753, - "train_runtime": 342.1722, - "train_samples_per_second": 286.435, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -76,9 +66,7 @@ "multi_card": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 91.71, - "train_runtime": 80.307, - "train_samples_per_second": 2150.333, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -93,9 +81,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 256, - "eval_f1": 0.867, - "train_runtime": 33.2909, - "train_samples_per_second": 1100.598, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -104,9 +90,7 @@ "multi_card": { "learning_rate": 3e-5, "train_batch_size": 40, - "eval_f1": 0.8452579034941764, - "train_runtime": 31.445, - "train_samples_per_second": 2845.068, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/bloom_7b1.json b/tests/configs/examples/bloom_7b1.json similarity index 84% rename from tests/baselines/bloom_7b1.json rename to tests/configs/examples/bloom_7b1.json index 37251e8651..9de0a72315 100644 --- a/tests/baselines/bloom_7b1.json +++ b/tests/configs/examples/bloom_7b1.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "wikitext": { "num_train_epochs": 3, "eval_batch_size": 
4, @@ -7,8 +7,7 @@ "deepspeed": { "learning_rate": 1e-4, "train_batch_size": 8, - "train_runtime": 1556.481, - "train_samples_per_second": 4.757, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_cache False", @@ -20,4 +19,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/configs/examples/bridgetower_large_itm_mlm_itc.json similarity index 90% rename from tests/baselines/bridgetower_large_itm_mlm_itc.json rename to tests/configs/examples/bridgetower_large_itm_mlm_itc.json index 6a1a8540b0..6dce3b79dc 100644 --- a/tests/baselines/bridgetower_large_itm_mlm_itc.json +++ b/tests/configs/examples/bridgetower_large_itm_mlm_itc.json @@ -7,8 +7,7 @@ "multi_card": { "learning_rate": 1e-5, "train_batch_size": 48, - "train_runtime": 224.42, - "train_samples_per_second": 904.93, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name matching", "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6", @@ -25,4 +24,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/chatglm3_6b.json b/tests/configs/examples/chatglm3_6b.json similarity index 87% rename from tests/baselines/chatglm3_6b.json rename to tests/configs/examples/chatglm3_6b.json index 3a8c7a2feb..ce55433e91 100644 --- a/tests/baselines/chatglm3_6b.json +++ b/tests/configs/examples/chatglm3_6b.json @@ -7,9 +7,7 @@ "deepspeed": { "learning_rate": 5e-5, "train_batch_size": 4, - "perplexity": 16.51629, - "train_runtime": 445, - "train_samples_per_second": 18.216, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_name wikitext", "--dataset_config_name wikitext-2-raw-v1", @@ -28,4 +26,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/clip_roberta.json b/tests/configs/examples/clip_roberta.json similarity index 90% rename from tests/baselines/clip_roberta.json rename to tests/configs/examples/clip_roberta.json index 0c2dfec435..37e9d1f5cc 100755 --- a/tests/baselines/clip_roberta.json +++ b/tests/configs/examples/clip_roberta.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "ydshieh/coco_dataset_script": { "num_train_epochs": 1, "eval_batch_size": 64, @@ -7,8 +7,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 64, - "train_runtime": 314.7726, - "train_samples_per_second": 2560.999, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--data_dir $PWD/", "--dataset_config_name 2017", @@ -35,8 +34,7 @@ "multi_card": { "learning_rate": 5e-05, "train_batch_size": 512, - "train_runtime": 59.50, - "train_samples_per_second": 14124, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--data_dir $PWD/", "--dataset_config_name 2017", @@ -57,4 +55,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/configs/examples/distilbert_base_uncased.json similarity index 71% rename from tests/baselines/distilbert_base_uncased.json rename to tests/configs/examples/distilbert_base_uncased.json index 8678342e7b..0eb215102a 100644 --- a/tests/baselines/distilbert_base_uncased.json +++ b/tests/configs/examples/distilbert_base_uncased.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 1e-4, "train_batch_size": 48, - "eval_f1": 84.5384, - 
"train_runtime": 264.3669, - "train_samples_per_second": 344.126, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 48, - "eval_f1": 83.0667, - "train_runtime": 54.5344, - "train_samples_per_second": 2503.657, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -37,9 +33,7 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_f1": 84.4002097183518, - "train_runtime": 136.3135, - "train_samples_per_second": 1329.313, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +42,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 64, - "eval_f1": 83.15565271833093, - "train_runtime": 25.9614, - "train_samples_per_second": 9259.038, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -59,4 +51,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/falcon_40b.json b/tests/configs/examples/falcon_40b.json similarity index 91% rename from tests/baselines/falcon_40b.json rename to tests/configs/examples/falcon_40b.json index f4e26f0a03..73c0ef93be 100644 --- a/tests/baselines/falcon_40b.json +++ b/tests/configs/examples/falcon_40b.json @@ -7,9 +7,7 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 1, - "perplexity": 4.0, - "train_runtime": 550, - "train_samples_per_second": 15.0, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 16", @@ -42,9 +40,7 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 1, - "perplexity": 1.6, - "train_runtime": 710, - "train_samples_per_second": 15.0, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 16", diff --git a/tests/baselines/flan_t5_xxl.json b/tests/configs/examples/flan_t5_xxl.json similarity index 86% rename from tests/baselines/flan_t5_xxl.json rename to tests/configs/examples/flan_t5_xxl.json index 5c5da7bdf4..3f67ea03b3 100644 --- a/tests/baselines/flan_t5_xxl.json +++ b/tests/configs/examples/flan_t5_xxl.json @@ -7,9 +7,7 @@ "deepspeed": { "learning_rate": 1e-4, "train_batch_size": 22, - "eval_rougeLsum": 29.308, - "train_runtime": 155.86, - "train_samples_per_second": 28.387, + "metrics": ["eval_rougeLsum", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_steps 20", "--max_eval_samples 880", @@ -27,4 +25,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/gemma_2b_it.json b/tests/configs/examples/gemma_2b_it.json similarity index 73% rename from tests/baselines/gemma_2b_it.json rename to tests/configs/examples/gemma_2b_it.json index eb92692ed2..cfea562791 100644 --- a/tests/baselines/gemma_2b_it.json +++ b/tests/configs/examples/gemma_2b_it.json @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 4, - "perplexity": 26.39, - "train_runtime": 356.07, - "train_samples_per_second": 14.06, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ 
"multi_card": { "learning_rate": 8e-4, "train_batch_size": 4, - "perplexity": 954.5995, - "train_runtime": 82.6617, - "train_samples_per_second": 94.524, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" @@ -29,9 +25,7 @@ "deepspeed": { "learning_rate": 8e-4, "train_batch_size": 4, - "perplexity": 924.062, - "train_runtime": 75.518, - "train_samples_per_second": 81.097, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -41,4 +35,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/gemma_2b_it_eager.json b/tests/configs/examples/gemma_2b_it_eager.json similarity index 75% rename from tests/baselines/gemma_2b_it_eager.json rename to tests/configs/examples/gemma_2b_it_eager.json index 54ba546ccc..09808d99d5 100644 --- a/tests/baselines/gemma_2b_it_eager.json +++ b/tests/configs/examples/gemma_2b_it_eager.json @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 4, - "perplexity": 26.69, - "train_runtime": 560.8188, - "train_samples_per_second": 8.597, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1" ] @@ -17,4 +15,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/gpt2.json b/tests/configs/examples/gpt2.json similarity index 73% rename from tests/baselines/gpt2.json rename to tests/configs/examples/gpt2.json index f293e9325c..747ec83dd8 100644 --- a/tests/baselines/gpt2.json +++ b/tests/configs/examples/gpt2.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "wikitext": { "num_train_epochs": 2, "eval_batch_size": 4, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 5e-5, "train_batch_size": 4, - "perplexity": 22.2751, - "train_runtime": 225.2898, - "train_samples_per_second": 21.308, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -19,9 +17,7 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 4, - "perplexity": 22.2699, - "train_runtime": 68.9627, - "train_samples_per_second": 156.241, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -39,9 +35,7 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 16, - "perplexity": 21.0729, - "train_runtime": 43.9361, - "train_samples_per_second": 130.785, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" @@ -50,9 +44,7 @@ "multi_card": { "learning_rate": 8e-4, "train_batch_size": 16, - "perplexity": 21.795393847747704, - "train_runtime": 26.1248, - "train_samples_per_second": 734.196, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" @@ -61,4 +53,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/gpt2_xl.json b/tests/configs/examples/gpt2_xl.json similarity index 86% rename from tests/baselines/gpt2_xl.json rename to tests/configs/examples/gpt2_xl.json index 68651d16e3..eb89da6d27 100644 --- a/tests/baselines/gpt2_xl.json +++ 
b/tests/configs/examples/gpt2_xl.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "wikitext": { "num_train_epochs": 2, "eval_batch_size": 4, @@ -10,6 +10,4 @@ - "perplexity": 12.6744, - "train_runtime": 366.8694, - "train_samples_per_second": 16.464, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -27,9 +25,7 @@ "deepspeed": { "learning_rate": 4e-4, "train_batch_size": 16, - "perplexity": 13.237754028004865, - "train_runtime": 206.5775, - "train_samples_per_second": 95.539, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", diff --git a/tests/baselines/gpt_neox_20b.json b/tests/configs/examples/gpt_neox_20b.json similarity index 80% rename from tests/baselines/gpt_neox_20b.json rename to tests/configs/examples/gpt_neox_20b.json index cb8664a1db..5a68691c16 100644 --- a/tests/baselines/gpt_neox_20b.json +++ b/tests/configs/examples/gpt_neox_20b.json @@ -7,9 +7,7 @@ "deepspeed": { "learning_rate": 5e-5, "train_batch_size": 2, - "perplexity": 8.169664686471043, - "train_runtime": 445, - "train_samples_per_second": 7.328, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", @@ -20,4 +18,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/idefics2_8b.json b/tests/configs/examples/idefics2_8b.json similarity index 91% rename from tests/baselines/idefics2_8b.json rename to tests/configs/examples/idefics2_8b.json index f40995c72d..c74f37ecee 100644 --- a/tests/baselines/idefics2_8b.json +++ b/tests/configs/examples/idefics2_8b.json @@ -7,9 +7,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 2, - "train_runtime": 286, - "train_samples_per_second": 11.8, - "eval_accuracy": 0.6, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 8", diff --git a/tests/baselines/llama_7b.json b/tests/configs/examples/llama_7b.json similarity index 89% rename from tests/baselines/llama_7b.json rename to tests/configs/examples/llama_7b.json index dcfd6d3807..29c4a23e0a 100644 --- a/tests/baselines/llama_7b.json +++ b/tests/configs/examples/llama_7b.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "databricks/databricks-dolly-15k": { "num_train_epochs": 1, "eval_batch_size": 2, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 2, - "perplexity": 3.9168, - "train_runtime": 132.665, - "train_samples_per_second": 2.295, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -43,9 +41,7 @@ "multi_card": { "learning_rate": 1e-4, "train_batch_size": 2, - "perplexity": 2.7542, - "train_runtime": 538.0159, - "train_samples_per_second": 20.397, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 4", @@ -68,9 +64,7 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 16, - "perplexity": 3.8436, - "train_runtime": 113.9713, - "train_samples_per_second": 18.428, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -104,9 +98,7 @@ "multi_card": { "learning_rate": 3e-4, 
"train_batch_size": 8, - "perplexity": 2.3665, - "train_runtime": 294.5707, - "train_samples_per_second": 148.093, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -139,9 +131,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 2.3665, - "train_runtime": 294.5707, - "train_samples_per_second": 148.093, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -173,9 +163,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 2.4259, - "train_runtime": 186.2483, - "train_samples_per_second": 93.5, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 2", @@ -212,9 +200,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 5.575, - "train_runtime": 131.7, - "train_samples_per_second": 294, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -250,8 +236,7 @@ "multi_card": { "learning_rate": 1e-4, "train_batch_size": 4, - "train_runtime": 206, - "train_samples_per_second": 51.54, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 2", @@ -281,8 +266,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 1, - "train_runtime": 234.6471, - "train_samples_per_second": 13.499, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--logging_steps 1", "--lora_r 8", @@ -312,8 +296,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 1, - "train_runtime": 250, - "train_samples_per_second": 1.6, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--logging_steps 1", "--lora_r 8", @@ -338,8 +321,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 8, - "train_runtime": 62, - "train_samples_per_second": 0.50, + "metrics": ["train_runtime", "train_samples_per_second"], "extra_arguments": [ "--lora_r 8", "--lora_alpha 16", @@ -366,9 +348,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 1, - "train_runtime": 16.5, - "train_samples_per_second": 63.161, - "perplexity": 1.224, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--num_virtual_tokens 8", "--max_seq_length 64", @@ -391,9 +371,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 1, - "train_runtime": 16.1, - "train_samples_per_second": 63.249, - "perplexity": 1.172, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--num_virtual_tokens 8", "--max_seq_length 64", @@ -416,9 +394,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 1, - "train_runtime": 18.7, - "train_samples_per_second": 63.161, - "perplexity": 1.047, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--num_virtual_tokens 8", "--max_seq_length 64", @@ -441,9 +417,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 16, - "perplexity": 2.3692, - "train_runtime": 411.9935, - "train_samples_per_second": 232.439, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -478,9 +452,7 @@ "multi_card": { "learning_rate": 3e-4, 
"train_batch_size": 8, - "perplexity": 3.3, - "train_runtime": 262.8, - "train_samples_per_second": 161, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -511,9 +483,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 2.59, - "train_runtime": 459, - "train_samples_per_second": 107, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -552,9 +522,7 @@ "multi_card": { "learning_rate": 1e-2, "train_batch_size": 8, - "perplexity": 9.064502567217577, - "train_runtime": 312.9258, - "train_samples_per_second": 127.305, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -585,9 +553,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 2.83, - "train_runtime": 249, - "train_samples_per_second": 165, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -618,9 +584,7 @@ "deepspeed": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 2.8889, - "train_runtime": 147.3597, - "train_samples_per_second": 34.41, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 4", diff --git a/tests/baselines/llava_1_5_7b_hf.json b/tests/configs/examples/llava_1_5_7b_hf.json similarity index 90% rename from tests/baselines/llava_1_5_7b_hf.json rename to tests/configs/examples/llava_1_5_7b_hf.json index 83480ce8f2..774ca979e0 100644 --- a/tests/baselines/llava_1_5_7b_hf.json +++ b/tests/configs/examples/llava_1_5_7b_hf.json @@ -7,9 +7,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 2, - "train_runtime": 118.5782, - "train_samples_per_second": 25.146, - "eval_accuracy": 0.2122, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 8", @@ -35,4 +33,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/protst_esm1b_for_sequential_classification.json b/tests/configs/examples/protst_esm1b_for_sequential_classification.json similarity index 82% rename from tests/baselines/protst_esm1b_for_sequential_classification.json rename to tests/configs/examples/protst_esm1b_for_sequential_classification.json index 00364acbe3..d80c1dd57d 100644 --- a/tests/baselines/protst_esm1b_for_sequential_classification.json +++ b/tests/configs/examples/protst_esm1b_for_sequential_classification.json @@ -7,9 +7,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 32, - "train_runtime": 38.9504, - "train_samples_per_second": 768.648, - "eval_accuracy": 0.5436668594563332, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--save_strategy no", "--tokenizer_name facebook/esm1b_t33_650M_UR50S", @@ -23,4 +21,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/roberta_base.json b/tests/configs/examples/roberta_base.json similarity index 74% rename from tests/baselines/roberta_base.json rename to tests/configs/examples/roberta_base.json index 1c196fce1b..8409805d8c 100644 --- a/tests/baselines/roberta_base.json +++ b/tests/configs/examples/roberta_base.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, 
"eval_batch_size": 8, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 12, - "eval_f1": 91.9903, - "train_runtime": 599.9343, - "train_samples_per_second": 149.781, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 12, - "eval_f1": 91.624, - "train_runtime": 103.5987, - "train_samples_per_second": 1083.304, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -35,9 +31,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 24, - "perplexity": 3.6338, - "train_runtime": 43.1541, - "train_samples_per_second": 554.787, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -55,9 +49,7 @@ "single_card": { "learning_rate": 7e-5, "train_batch_size": 64, - "eval_f1": 91.5253, - "train_runtime": 120.6563, - "train_samples_per_second": 847.504, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -66,9 +58,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_f1": 90.8766, - "train_runtime": 32.2213, - "train_samples_per_second": 6568.625, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -83,9 +73,7 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "perplexity": 3.6691, - "train_runtime": 12.3633, - "train_samples_per_second": 2758.371, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", diff --git a/tests/baselines/roberta_large.json b/tests/configs/examples/roberta_large.json similarity index 74% rename from tests/baselines/roberta_large.json rename to tests/configs/examples/roberta_large.json index 4d7233e089..90b6dd5dce 100755 --- a/tests/baselines/roberta_large.json +++ b/tests/configs/examples/roberta_large.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 12, - "eval_f1": 94.2959, - "train_runtime": 1771.3319, - "train_samples_per_second": 50.815, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +16,7 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 12, - "eval_f1": 94.2867, - "train_runtime": 304.9084, - "train_samples_per_second": 366.177, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -35,9 +31,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 8, - "perplexity": 2.7851, - "train_runtime": 75.0033, - "train_samples_per_second": 217.752, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -55,9 +49,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 94.5886, - "train_runtime": 361.4789, - 
"train_samples_per_second": 266.47, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -66,9 +58,7 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 32, - "eval_f1": 94.09, - "train_runtime": 79.333, - "train_samples_per_second": 2138.366, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -83,9 +73,7 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 16, - "perplexity": 2.829522488584474, - "train_runtime": 22.7101, - "train_samples_per_second": 1056.875, + "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json similarity index 81% rename from tests/baselines/swin_base_patch4_window7_224_in22k.json rename to tests/configs/examples/swin_base_patch4_window7_224_in22k.json index 8e0a5c40c3..3f6a6c8693 100644 --- a/tests/baselines/swin_base_patch4_window7_224_in22k.json +++ b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "cifar10": { "num_train_epochs": 1, "eval_batch_size": 64, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 64, - "eval_accuracy": 0.9871, - "train_runtime": 246.4134, - "train_samples_per_second": 212.722, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -24,9 +22,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_accuracy": 0.9819, - "train_runtime": 117.6424, - "train_samples_per_second": 1683.344, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -49,9 +45,7 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 160, - "eval_accuracy": 0.9850666666666666, - "train_runtime": 77.8934, - "train_samples_per_second": 826.766, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -66,9 +60,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 160, - "eval_accuracy": 0.9821, - "train_runtime": 62.9986, - "train_samples_per_second": 6202.525, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -83,4 +75,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/t5_small.json b/tests/configs/examples/t5_small.json similarity index 81% rename from tests/baselines/t5_small.json rename to tests/configs/examples/t5_small.json index 31f9c80ef6..38b1b4f11f 100644 --- a/tests/baselines/t5_small.json +++ b/tests/configs/examples/t5_small.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "cnn_dailymail": { "num_train_epochs": 1, "eval_batch_size": 4, @@ -7,10 +7,7 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 4, - "eval_rougeLsum": 38.5895, - "train_runtime": 1089.366, - "train_samples_per_second": 267.843, - "eval_samples_per_second": 71.913, + "metrics": ["eval_rougeLsum", "eval_samples_per_second", "train_runtime", 
"train_samples_per_second"], "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -30,9 +27,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 16, - "eval_f1": 64.8769, - "train_runtime": 230.6405, - "train_samples_per_second": 1235.893, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--context_column context", "--question_column question", @@ -57,10 +52,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 32, - "eval_rougeLsum": 38.5977, - "train_runtime": 162.079, - "train_samples_per_second": 1922.144, - "eval_samples_per_second": 96.797, + "metrics": ["eval_rougeLsum", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -80,9 +72,7 @@ "multi_card": { "learning_rate": 2e-3, "train_batch_size": 64, - "eval_f1": 65.83485191703365, - "train_runtime": 53.8295, - "train_samples_per_second": 5686.229, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--context_column context", "--question_column question", @@ -105,9 +95,7 @@ "multi_card": { "learning_rate": 2e-3, "train_batch_size": 64, - "eval_f1": 0.8701550387596899, - "train_runtime": 7.0713, - "train_samples_per_second": 18461.413, + "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--use_hpu_graphs_for_inference", "--use_hpu_graphs_for_training", @@ -126,9 +114,7 @@ "multi_card": { "learning_rate": 2e-3, "train_batch_size": 8, - "eval_accuracy": 0.38, - "train_runtime": 16, - "train_samples_per_second": 1268, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--use_hpu_graphs_for_inference", "--use_hpu_graphs_for_training", @@ -143,4 +129,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/vit_base_patch16_224_in21k.json b/tests/configs/examples/vit_base_patch16_224_in21k.json similarity index 81% rename from tests/baselines/vit_base_patch16_224_in21k.json rename to tests/configs/examples/vit_base_patch16_224_in21k.json index 7b27afd29a..1071455031 100644 --- a/tests/baselines/vit_base_patch16_224_in21k.json +++ b/tests/configs/examples/vit_base_patch16_224_in21k.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "cifar10": { "num_train_epochs": 1, "eval_batch_size": 64, @@ -7,9 +7,7 @@ "single_card": { "learning_rate": 5e-5, "train_batch_size": 64, - "eval_accuracy": 0.9812, - "train_runtime": 136.9418, - "train_samples_per_second": 359.584, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -23,9 +21,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_accuracy": 0.9803, - "train_runtime": 59.972, - "train_samples_per_second": 2508.955, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -48,9 +44,7 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 128, - "eval_accuracy": 0.9690666666666666, - "train_runtime": 54.9734, - "train_samples_per_second": 870.272, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -64,9 +58,7 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 128, - "eval_accuracy": 0.9679, - 
"train_runtime": 23.99, - "train_samples_per_second": 6718.643, + "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -81,4 +73,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/wav2vec2_base.json b/tests/configs/examples/wav2vec2_base.json similarity index 81% rename from tests/baselines/wav2vec2_base.json rename to tests/configs/examples/wav2vec2_base.json index b187e02d51..3b8d8e2b70 100644 --- a/tests/baselines/wav2vec2_base.json +++ b/tests/configs/examples/wav2vec2_base.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "common_language": { "num_train_epochs": 10, "eval_batch_size": 64, @@ -7,10 +7,7 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 32, - "eval_accuracy": 0.8013, - "train_runtime": 366.8081, - "train_samples_per_second": 716.385, - "eval_samples_per_second": 329.12, + "metrics": ["eval_accuracy", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -36,10 +33,7 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 32, - "eval_accuracy": 0.7228, - "train_runtime": 63.4079, - "train_samples_per_second": 2975.844, - "eval_samples_per_second": 3640.021, + "metrics": ["eval_accuracy", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -57,4 +51,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/wav2vec2_large_lv60.json b/tests/configs/examples/wav2vec2_large_lv60.json similarity index 83% rename from tests/baselines/wav2vec2_large_lv60.json rename to tests/configs/examples/wav2vec2_large_lv60.json index 920239618b..3ac83a4638 100644 --- a/tests/baselines/wav2vec2_large_lv60.json +++ b/tests/configs/examples/wav2vec2_large_lv60.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "regisss/librispeech_asr_for_optimum_habana_ci": { "num_train_epochs": 2, "eval_batch_size": 8, @@ -7,10 +7,7 @@ "multi_card": { "learning_rate": 6e-4, "train_batch_size": 8, - "eval_wer": 0.0496, - "train_runtime": 984.3022, - "train_samples_per_second": 63.043, - "eval_samples_per_second": 54.189, + "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name clean", "--train_split_name train.100", @@ -36,10 +33,7 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 8, - "eval_wer": 0.11090, - "train_runtime": 308.8036, - "train_samples_per_second": 225.572, - "eval_samples_per_second": 196.665, + "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], "extra_arguments": [ "--dataset_config_name clean", "--train_split_name train.100", diff --git a/tests/baselines/whisper_small.json b/tests/configs/examples/whisper_small.json similarity index 85% rename from tests/baselines/whisper_small.json rename to tests/configs/examples/whisper_small.json index 055d321152..b971a404da 100644 --- a/tests/baselines/whisper_small.json +++ b/tests/configs/examples/whisper_small.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "mozilla-foundation/common_voice_11_0": { "num_train_epochs": 10, "eval_batch_size": 2, @@ -7,10 +7,7 @@ "multi_card": { "learning_rate": 1e-4, "train_batch_size": 8, - "eval_wer": 2.1133, - "train_runtime": 551.3249, - "train_samples_per_second": 145.59, - "eval_samples_per_second": 
+                    "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"],
                     "extra_arguments": [
                         "--dataset_config_name hi",
                         "--language hindi",
@@ -41,10 +38,7 @@
                 "multi_card": {
                     "learning_rate": 8e-5,
                     "train_batch_size": 32,
-                    "eval_wer": 0.4693843594009983,
-                    "train_runtime": 380.00,
-                    "train_samples_per_second": 218.0,
-                    "eval_samples_per_second": 31.0,
+                    "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"],
                     "extra_arguments": [
                         "--dataset_config_name hi",
                         "--language hindi",
@@ -66,4 +60,4 @@
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 578ba7825e..ed4345633e 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 
 import json
+import logging
+import operator
 import os
 import re
 import subprocess
@@ -23,6 +25,7 @@
 from typing import Callable, Dict, List, Optional, Tuple, Union
 from unittest import TestCase
 
+import pytest
 from transformers import (
     CONFIG_MAPPING,
     MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
@@ -54,7 +57,7 @@
 )
 
 
-BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("baselines")
+CONFIG_DIRECTORY = Path(__file__).parent.resolve() / Path("configs") / Path("examples")
 # Models should reach at least 99% of their baseline accuracy
 ACCURACY_PERF_FACTOR = 0.99
 # Trainings/Evaluations should last at most 5% longer than the baseline
@@ -62,6 +65,7 @@
 
 
 IS_GAUDI2 = bool("gaudi2" == OH_DEVICE_CONTEXT)
+IS_GAUDI1 = bool("gaudi1" == OH_DEVICE_CONTEXT)
 
 
 def _get_supported_models_for_script(
@@ -454,29 +458,28 @@ def test(self):
             self._install_requirements(example_script.parent / "requirements.txt")
 
-            # collect baseline from _eager.json if eager_mode is True
+            # collect test_config from _eager.json if eager_mode is True
             if self.EAGER_MODE:
-                baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
+                config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager"
             else:
-                baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
+                config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_")
 
-            path_to_baseline = BASELINE_DIRECTORY / Path(baseline_name).with_suffix(".json")
+            path_to_config = CONFIG_DIRECTORY / Path(config_name).with_suffix(".json")
 
-            with path_to_baseline.open("r") as json_file:
-                device = "gaudi2" if IS_GAUDI2 else "gaudi"
-                baseline = json.load(json_file)[device]
+            with path_to_config.open("r") as json_file:
+                test_config = json.load(json_file)[OH_DEVICE_CONTEXT]
 
             if isinstance(self.TASK_NAME, list):
                 for key in self.TASK_NAME:
-                    if key in baseline:
-                        baseline = baseline[key]
+                    if key in test_config:
+                        test_config = test_config[key]
                         break
-                if "num_train_epochs" not in baseline:
+                if "num_train_epochs" not in test_config:
                     raise ValueError(
-                        f"Couldn't find a baseline associated to any of these tasks: {self.TASK_NAME}."
+                        f"Couldn't find a test config associated to any of these tasks: {self.TASK_NAME}."
                     )
                 self.TASK_NAME = key
             else:
-                baseline = baseline[self.TASK_NAME]
+                test_config = test_config[self.TASK_NAME]
 
             distribution = "single_card"
             if multi_card:
@@ -507,7 +510,7 @@ def test(self):
             if fp8 and "llama" in model_name:
                 env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt")
 
-            extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
+            extra_command_line_arguments = test_config.get("distribution").get(distribution).get("extra_arguments", [])
 
             if self.EAGER_MODE:
                 env_variables["PT_HPU_LAZY_MODE"] = "0"
@@ -569,10 +572,10 @@ def test(self):
                     gaudi_config_name,
                     tmp_dir,
                     task=self.TASK_NAME,
-                    lr=baseline.get("distribution").get(distribution).get("learning_rate"),
-                    train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"),
-                    eval_batch_size=baseline.get("eval_batch_size"),
-                    num_epochs=baseline.get("num_train_epochs"),
+                    lr=test_config.get("distribution").get(distribution).get("learning_rate"),
+                    train_batch_size=test_config.get("distribution").get(distribution).get("train_batch_size"),
+                    eval_batch_size=test_config.get("eval_batch_size"),
+                    num_epochs=test_config.get("num_train_epochs"),
                     extra_command_line_arguments=extra_command_line_arguments,
                 )
                 print(f"\n\nCommand to test: {' '.join(cmd_line[:])}\n")
@@ -585,7 +588,9 @@ def test(self):
                 with open(Path(tmp_dir) / "all_results.json") as fp:
                     results = json.load(fp)
                 # Ensure performance requirements (accuracy, training time) are met
-                self.assert_no_regression(results, baseline.get("distribution").get(distribution), model_name)
+                self.assert_no_regression(
+                    results, test_config.get("distribution").get(distribution).get("metrics"), model_name
+                )
 
             # TODO: is a cleanup of the dataset cache needed?
             # self._cleanup_dataset_cache()
@@ -612,17 +617,24 @@ class ExampleTesterBase(TestCase):
     DATASET_PARAMETER_NAME = "dataset_name"
     DATASET_NAME = None
     REGRESSION_METRICS = {
-        "eval_f1": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-        "eval_accuracy": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-        "perplexity": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-        "eval_rougeLsum": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR),
-        "train_runtime": (TestCase.assertLessEqual, TIME_PERF_FACTOR),
-        "eval_wer": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR),
-        "train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
-        "eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR),
+        "eval_f1": (operator.ge, ACCURACY_PERF_FACTOR),
+        "eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),
+        "perplexity": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+        "eval_rougeLsum": (operator.ge, ACCURACY_PERF_FACTOR),
+        "train_runtime": (operator.le, TIME_PERF_FACTOR),
+        "eval_wer": (operator.le, 2 - ACCURACY_PERF_FACTOR),
+        "train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
+        "eval_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),
     }
     EAGER_MODE = False
 
+    @pytest.fixture(autouse=True)
+    def _use_(self, baseline):
+        """
+        https://docs.pytest.org/en/stable/how-to/unittest.html#using-autouse-fixtures-and-accessing-other-fixtures
+        """
+        self.baseline = baseline
+
     def _create_command_line(
         self,
         multi_card: bool,
@@ -717,53 +729,57 @@ def _install_requirements(self, requirements_filename: Union[str, os.PathLike]):
         return_code = p.wait()
         self.assertEqual(return_code, 0)
 
-    def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str):
+    def assert_no_regression(self, results: Dict, metrics: List, model_name: str):
         """
         Assert whether all possible performance requirements are met.
 
         Attributes:
             results (Dict): results of the run to assess
-            baseline (Dict): baseline to assert whether or not there is regression
+            metrics (List): metrics to assert whether or not there is regression
         """
+
         # Gather all the metrics to assess
-        metrics_to_assess = []
-        for metric_name in self.REGRESSION_METRICS.keys():
-            if metric_name in baseline and metric_name in results:
-                metrics_to_assess.append(metric_name)
-        # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
+        metrics_to_assess = list(set(self.REGRESSION_METRICS.keys()) & set(metrics) & set(results.keys()))
         min_number_metrics = 3
+
+        # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM
         if (
             self.EXAMPLE_NAME in ["run_clip", "run_bridgetower", "sft", "dpo", "ppo", "reward_modeling"]
             or "bloom" in model_name
         ):
            min_number_metrics = 2
 
-        # Check that at least 3 metrics are assessed:
+        # Check that at least min_number_metrics are assessed:
         # training time + throughput + accuracy metric (F1, accuracy, perplexity,...)
         self.assertGreaterEqual(
            len(metrics_to_assess),
            min_number_metrics,
            (
-                f"{len(metrics_to_assess)} asserted metric(s) while at least 3 are expected (throughput + training"
-                f" time + accuracy). Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received:"
-                f" {baseline.keys()}"
+                f"{len(metrics_to_assess)} asserted metric(s) while at least"
+                f" {min_number_metrics} are expected (throughput + training time + accuracy*)."
+                f" Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received: {metrics}"
            ),
        )
 
-        # Message to display if one test fails
-        # This enables to show all the results and baselines even if one test fails before others
-        failure_message = "\n===== Assessed metrics (measured vs thresholded baseline) =====\n"
-        for metric_name in metrics_to_assess:
-            failure_message += f"{metric_name}: {results[metric_name]} vs {self.REGRESSION_METRICS[metric_name][1] * baseline[metric_name]}\n"
-
         # Assess metrics
+        passed = True
         for metric_name in metrics_to_assess:
-            assert_function, threshold_factor = self.REGRESSION_METRICS[metric_name]
-            assert_function(
-                self,
-                results[metric_name],
-                threshold_factor * baseline[metric_name],
-                msg=f"for metric {metric_name}. {failure_message}",
-            )
+            fn, threshold = self.REGRESSION_METRICS[metric_name]
+
+            def check(actual, ref):
+                check.msg = f"{metric_name}: {fn.__name__}({actual}, {threshold} * {ref})\n"
+                return fn(actual, threshold * ref)
+
+            check.msg = ""
+
+            try:
+                self.baseline.assertRef(
+                    compare=check, context=[OH_DEVICE_CONTEXT], **{metric_name: results[metric_name]}
+                )
+            except Exception:
+                logging.getLogger().error(check.msg)
+                passed = False
+
+        assert passed, "One or more metrics failed"
 
 
 class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue"):
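For reference, the `operator`-based entries in `REGRESSION_METRICS` encode a simple tolerance band: accuracy-style metrics must reach at least 99% of the reference value, perplexity and WER may exceed it by at most 1%, runtimes by at most 5%, and throughput may drop by at most 5%. The sketch below shows that logic in isolation; the helper name `meets_baseline` and the sample numbers are invented for illustration and are not part of this change.

```python
import operator

# Thresholds mirroring the constants used by the test suite
ACCURACY_PERF_FACTOR = 0.99   # accuracy-like metrics must reach 99% of the reference
TIME_PERF_FACTOR = 1.05       # runtimes may be at most 5% slower than the reference

# (comparison operator, threshold factor) per metric, as in REGRESSION_METRICS
REGRESSION_RULES = {
    "eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR),              # >= 0.99 * ref
    "perplexity": (operator.le, 2 - ACCURACY_PERF_FACTOR),             # <= 1.01 * ref
    "train_runtime": (operator.le, TIME_PERF_FACTOR),                  # <= 1.05 * ref
    "train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR),   # >= 0.95 * ref
}


def meets_baseline(metric: str, measured: float, reference: float) -> bool:
    """Return True if `measured` stays inside the tolerated band around `reference`."""
    compare, factor = REGRESSION_RULES[metric]
    return compare(measured, factor * reference)


# A run that is 3% slower than the reference still passes, one that is 7% slower fails.
assert meets_baseline("train_runtime", 103.0, 100.0) is True
assert meets_baseline("train_runtime", 107.0, 100.0) is False
```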
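The renamed files under `tests/configs/examples/` now carry only hyperparameters plus the list of metric names to assert, while the reference values themselves are resolved at run time through the pytest `baseline` fixture. A minimal, hypothetical reader for the new schema is sketched below; the embedded config mirrors the `gaudi1`/`cifar10`/`single_card` entry of `swin_base_patch4_window7_224_in22k.json` from this diff, and the surrounding reader code is illustrative rather than part of the test suite.

```python
import json

# A minimal config in the new schema (values copied from the swin_base entry above).
EXAMPLE_CONFIG = """
{
    "gaudi1": {
        "cifar10": {
            "num_train_epochs": 1,
            "eval_batch_size": 64,
            "distribution": {
                "single_card": {
                    "learning_rate": 3e-5,
                    "train_batch_size": 64,
                    "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"],
                    "extra_arguments": ["--remove_unused_columns False", "--image_column_name img"]
                }
            }
        }
    }
}
"""

device, task, distribution = "gaudi1", "cifar10", "single_card"  # device key / task / card setup
test_config = json.loads(EXAMPLE_CONFIG)[device][task]
run_settings = test_config["distribution"][distribution]

# Hyperparameters still live in the config ...
print(test_config["num_train_epochs"], run_settings["learning_rate"], run_settings["train_batch_size"])
# ... but only the *names* of the metrics to assert remain; the reference values are
# looked up elsewhere (via the pytest `baseline` fixture in the test suite).
print(run_settings["metrics"])
```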