diff --git a/tests/baselines/fixture/tests/test_examples.json b/tests/baselines/fixture/tests/test_examples.json new file mode 100644 index 0000000000..d1820727f8 --- /dev/null +++ b/tests/baselines/fixture/tests/test_examples.json @@ -0,0 +1,440 @@ +{ + "tests/test_examples.py::CausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": { + "gaudi2": { + "perplexity": 26.39, + "train_runtime": 356.07, + "train_samples_per_second": 14.06 + } + }, + "tests/test_examples.py::CausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_single_card": { + "gaudi1": { + "perplexity": 3.9168, + "train_runtime": 132.665, + "train_samples_per_second": 2.295 + }, + "gaudi2": { + "perplexity": 3.8436, + "train_runtime": 113.9713, + "train_samples_per_second": 18.428 + } + }, + "tests/test_examples.py::DeepSpeedTextClassificationExampleTester::test_run_glue_LlamaGuard-7b_deepspeed": { + "gaudi2": { + "eval_f1": 0.8873483535528596, + "train_runtime": 62.4539, + "train_samples_per_second": 342.169 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_CodeLlama-13b-Instruct-hf_deepspeed": { + "gaudi2": { + "perplexity": 6.877496628184696, + "train_runtime": 542.2985, + "train_samples_per_second": 18.789 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_bloom-7b1_deepspeed": { + "gaudi1": { + "train_runtime": 1556.481, + "train_samples_per_second": 4.757 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_chatglm3-6b_deepspeed": { + "gaudi2": { + "perplexity": 16.51629, + "train_runtime": 445, + "train_samples_per_second": 18.216 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_deepspeed": { + "gaudi2": { + "perplexity": 924.062, + "train_runtime": 75.518, + "train_samples_per_second": 81.097 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt-neox-20b_deepspeed": { + "gaudi2": { + "perplexity": 8.169664686471043, + "train_runtime": 445, + "train_samples_per_second": 7.328 + } + }, + "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt2-xl_deepspeed": { + "gaudi1": { + "perplexity": 12.6744, + "train_runtime": 366.8694, + "train_samples_per_second": 16.464 + }, + "gaudi2": { + "perplexity": 13.237754028004865, + "train_runtime": 206.5775, + "train_samples_per_second": 95.539 + } + }, + "tests/test_examples.py::DeepspeedSFTExampleTester::test_sft_Qwen2-72B_deepspeed": { + "gaudi2": { + "perplexity": 3.7020898897918824, + "train_runtime": 918.8018, + "train_samples_per_second": 7.554 + } + }, + "tests/test_examples.py::DeepspeedSummarizationExampleTester::test_run_summarization_flan-t5-xxl_deepspeed": { + "gaudi2": { + "eval_rougeLsum": 29.308, + "train_runtime": 155.86, + "train_samples_per_second": 28.387 + } + }, + "tests/test_examples.py::EagerModeCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": { + "gaudi2": { + "perplexity": 26.69, + "train_runtime": 560.8188, + "train_samples_per_second": 8.597 + } + }, + "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_single_card": { + "gaudi1": { + "eval_accuracy": 0.9871, + "train_runtime": 246.4134, + "train_samples_per_second": 212.722 + }, + "gaudi2": { + "eval_accuracy": 0.9850666666666666, + "train_runtime": 77.8934, + "train_samples_per_second": 826.766 + } + }, + "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_single_card": { + "gaudi1": { + "eval_accuracy": 0.9812, + "train_runtime": 136.9418, + "train_samples_per_second": 359.584 + }, + "gaudi2": { + "eval_accuracy": 0.9690666666666666, + "train_runtime": 54.9734, + "train_samples_per_second": 870.272 + } + }, + "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_ast-finetuned-speech-commands-v2_multi_card": { + "gaudi2": { + "eval_accuracy": 0.1871, + "eval_samples_per_second": 2301.088, + "train_runtime": 139.9477, + "train_samples_per_second": 1955.74 + } + }, + "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_wav2vec2-base_multi_card": { + "gaudi1": { + "eval_accuracy": 0.8013, + "eval_samples_per_second": 329.12, + "train_runtime": 366.8081, + "train_samples_per_second": 716.385 + }, + "gaudi2": { + "eval_accuracy": 0.7228, + "eval_samples_per_second": 3640.021, + "train_runtime": 63.4079, + "train_samples_per_second": 2975.844 + } + }, + "tests/test_examples.py::MultiCardBridgetowerExampleTester::test_run_bridgetower_bridgetower-large-itm-mlm-itc_multi_card": { + "gaudi2": { + "train_runtime": 224.42, + "train_samples_per_second": 904.93 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingAdaloraExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.59, + "train_runtime": 459, + "train_samples_per_second": 107 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_multi_card": { + "gaudi2": { + "perplexity": 954.5995, + "train_runtime": 82.6617, + "train_samples_per_second": 94.524 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingIA3ExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 3.3, + "train_runtime": 262.8, + "train_samples_per_second": 161 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_falcon-40b_multi_card": { + "gaudi2": { + "perplexity": 1.6, + "train_runtime": 710, + "train_samples_per_second": 15.0 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_falcon-40b_multi_card": { + "gaudi2": { + "perplexity": 4.0, + "train_runtime": 550, + "train_samples_per_second": 15.0 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi1": { + "perplexity": 2.7542, + "train_runtime": 538.0159, + "train_samples_per_second": 20.397 + }, + "gaudi2": { + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLORAFSDPCompileExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.4259, + "train_runtime": 186.2483, + "train_samples_per_second": 93.5 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLlamaAdapterExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 5.575, + "train_runtime": 131.7, + "train_samples_per_second": 294 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLnExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.83, + "train_runtime": 249, + "train_samples_per_second": 165 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLoRACPExampleTester::test_run_lora_clm_llama-7b_deepspeed": { + "gaudi2": { + "perplexity": 2.8889, + "train_runtime": 147.3597, + "train_samples_per_second": 34.41 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingLoRAFP8ExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 2.3692, + "train_runtime": 411.9935, + "train_samples_per_second": 232.439 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingPTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 1.047, + "train_runtime": 18.7, + "train_samples_per_second": 63.161 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingPrefixTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 1.172, + "train_runtime": 16.1, + "train_samples_per_second": 63.249 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingPromptTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 1.224, + "train_runtime": 16.5, + "train_samples_per_second": 63.161 + } + }, + "tests/test_examples.py::MultiCardCausalLanguageModelingVeraExampleTester::test_run_lora_clm_llama-7b_multi_card": { + "gaudi2": { + "perplexity": 9.064502567217577, + "train_runtime": 312.9258, + "train_samples_per_second": 127.305 + } + }, + "tests/test_examples.py::MultiCardDPOExampleTester::test_dpo_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 234.6471, + "train_samples_per_second": 13.499 + } + }, + "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_multi_card": { + "gaudi1": { + "eval_accuracy": 0.9819, + "train_runtime": 117.6424, + "train_samples_per_second": 1683.344 + }, + "gaudi2": { + "eval_accuracy": 0.9821, + "train_runtime": 62.9986, + "train_samples_per_second": 6202.525 + } + }, + "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_multi_card": { + "gaudi1": { + "eval_accuracy": 0.9803, + "train_runtime": 59.972, + "train_samples_per_second": 2508.955 + }, + "gaudi2": { + "eval_accuracy": 0.9679, + "train_runtime": 23.99, + "train_samples_per_second": 6718.643 + } + }, + "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_Llama-3.2-11B-Vision-Instruct_multi_card": { + "gaudi2": { + "eval_accuracy": 0.6, + "train_runtime": 350, + "train_samples_per_second": 20.48 + } + }, + "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_idefics2-8b_multi_card": { + "gaudi2": { + "eval_accuracy": 0.6, + "train_runtime": 286, + "train_samples_per_second": 11.8 + } + }, + "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_llava-1.5-7b-hf_multi_card": { + "gaudi2": { + "eval_accuracy": 0.2122, + "train_runtime": 118.5782, + "train_samples_per_second": 25.146 + } + }, + "tests/test_examples.py::MultiCardMaskedLanguageModelingExampleTester::test_run_mlm_roberta-large_multi_card": { + "gaudi1": { + "perplexity": 2.7851, + "train_runtime": 75.0033, + "train_samples_per_second": 217.752 + }, + "gaudi2": { + "perplexity": 2.829522488584474, + "train_runtime": 22.7101, + "train_samples_per_second": 1056.875 + } + }, + "tests/test_examples.py::MultiCardPPOExampleTester::test_ppo_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 62, + "train_samples_per_second": 0.5 + } + }, + "tests/test_examples.py::MultiCardProteinFoldingClassificationTester::test_run_sequence_classification_protst-esm1b-for-sequential-classification_multi_card": { + "gaudi2": { + "eval_accuracy": 0.5436668594563332, + "train_runtime": 38.9504, + "train_samples_per_second": 768.648 + } + }, + "tests/test_examples.py::MultiCardQuestionAnsweringExampleTester::test_run_qa_roberta-large_multi_card": { + "gaudi1": { + "eval_f1": 94.2867, + "train_runtime": 304.9084, + "train_samples_per_second": 366.177 + }, + "gaudi2": { + "eval_f1": 94.09, + "train_runtime": 79.333, + "train_samples_per_second": 2138.366 + } + }, + "tests/test_examples.py::MultiCardRewardExampleTester::test_reward_modeling_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 250, + "train_samples_per_second": 1.6 + } + }, + "tests/test_examples.py::MultiCardSFTChatExampleTester::test_sft_Qwen2-7B_multi_card": { + "gaudi2": { + "train_runtime": 423.995, + "train_samples_per_second": 7.342 + } + }, + "tests/test_examples.py::MultiCardSFTChatPeftExampleTester::test_sft_Qwen2-7B_multi_card": { + "gaudi2": { + "train_runtime": 410, + "train_samples_per_second": 120 + } + }, + "tests/test_examples.py::MultiCardSFTExampleTester::test_sft_llama-7b_multi_card": { + "gaudi2": { + "train_runtime": 206, + "train_samples_per_second": 51.54 + } + }, + "tests/test_examples.py::MultiCardSeq2SeqSpeechRecognitionExampleTester::test_run_speech_recognition_seq2seq_whisper-small_multi_card": { + "gaudi1": { + "eval_samples_per_second": 6.851, + "eval_wer": 2.1133, + "train_runtime": 551.3249, + "train_samples_per_second": 145.59 + }, + "gaudi2": { + "eval_samples_per_second": 31.0, + "eval_wer": 0.4693843594009983, + "train_runtime": 380.0, + "train_samples_per_second": 218.0 + } + }, + "tests/test_examples.py::MultiCardSpeechRecognitionExampleTester::test_run_speech_recognition_ctc_wav2vec2-large-lv60_multi_card": { + "gaudi1": { + "eval_samples_per_second": 54.189, + "eval_wer": 0.0496, + "train_runtime": 984.3022, + "train_samples_per_second": 63.043 + }, + "gaudi2": { + "eval_samples_per_second": 196.665, + "eval_wer": 0.1109, + "train_runtime": 308.8036, + "train_samples_per_second": 225.572 + } + }, + "tests/test_examples.py::MultiCardTextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_multi_card": { + "gaudi1": { + "eval_f1": 0.8897, + "train_runtime": 65.644, + "train_samples_per_second": 919.623 + }, + "gaudi2": { + "eval_f1": 0.8452579034941764, + "train_runtime": 31.445, + "train_samples_per_second": 2845.068 + } + }, + "tests/test_examples.py::QuestionAnsweringExampleTester::test_run_qa_roberta-large_single_card": { + "gaudi1": { + "eval_f1": 94.2959, + "train_runtime": 1771.3319, + "train_samples_per_second": 50.815 + }, + "gaudi2": { + "eval_f1": 94.5886, + "train_runtime": 361.4789, + "train_samples_per_second": 266.47 + } + }, + "tests/test_examples.py::TextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_single_card": { + "gaudi1": { + "eval_f1": 0.9022, + "train_runtime": 90.3943, + "train_samples_per_second": 172.792 + }, + "gaudi2": { + "eval_f1": 0.867, + "train_runtime": 33.2909, + "train_samples_per_second": 1100.598 + } + } +} \ No newline at end of file diff --git a/tests/baselines/CodeLlama_13b_Instruct_hf.json b/tests/configs/examples/CodeLlama_13b_Instruct_hf.json similarity index 100% rename from tests/baselines/CodeLlama_13b_Instruct_hf.json rename to tests/configs/examples/CodeLlama_13b_Instruct_hf.json diff --git a/tests/baselines/LlamaGuard_7b.json b/tests/configs/examples/LlamaGuard_7b.json similarity index 100% rename from tests/baselines/LlamaGuard_7b.json rename to tests/configs/examples/LlamaGuard_7b.json diff --git a/tests/baselines/Llama_3_1_8B.json b/tests/configs/examples/Llama_3_1_8B.json similarity index 100% rename from tests/baselines/Llama_3_1_8B.json rename to tests/configs/examples/Llama_3_1_8B.json diff --git a/tests/baselines/Llama_3_2_11B_Vision_Instruct.json b/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json similarity index 100% rename from tests/baselines/Llama_3_2_11B_Vision_Instruct.json rename to tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json diff --git a/tests/baselines/Qwen2_72B.json b/tests/configs/examples/Qwen2_72B.json similarity index 100% rename from tests/baselines/Qwen2_72B.json rename to tests/configs/examples/Qwen2_72B.json diff --git a/tests/baselines/Qwen2_7B.json b/tests/configs/examples/Qwen2_7B.json similarity index 100% rename from tests/baselines/Qwen2_7B.json rename to tests/configs/examples/Qwen2_7B.json diff --git a/tests/baselines/albert_large_v2.json b/tests/configs/examples/albert_large_v2.json similarity index 99% rename from tests/baselines/albert_large_v2.json rename to tests/configs/examples/albert_large_v2.json index 2f13722a95..1c4b9a945a 100644 --- a/tests/baselines/albert_large_v2.json +++ b/tests/configs/examples/albert_large_v2.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 2, "eval_batch_size": 4, @@ -59,4 +59,4 @@ } } } -} \ No newline at end of file +} diff --git a/tests/baselines/albert_xxlarge_v1.json b/tests/configs/examples/albert_xxlarge_v1.json similarity index 99% rename from tests/baselines/albert_xxlarge_v1.json rename to tests/configs/examples/albert_xxlarge_v1.json index 30f4fca526..bed9591e40 100644 --- a/tests/baselines/albert_xxlarge_v1.json +++ b/tests/configs/examples/albert_xxlarge_v1.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 2, diff --git a/tests/baselines/ast_finetuned_speech_commands_v2.json b/tests/configs/examples/ast_finetuned_speech_commands_v2.json similarity index 100% rename from tests/baselines/ast_finetuned_speech_commands_v2.json rename to tests/configs/examples/ast_finetuned_speech_commands_v2.json diff --git a/tests/baselines/bert_base_uncased.json b/tests/configs/examples/bert_base_uncased.json similarity index 100% rename from tests/baselines/bert_base_uncased.json rename to tests/configs/examples/bert_base_uncased.json diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/configs/examples/bert_large_uncased_whole_word_masking.json similarity index 99% rename from tests/baselines/bert_large_uncased_whole_word_masking.json rename to tests/configs/examples/bert_large_uncased_whole_word_masking.json index 605e719faf..e90e142262 100755 --- a/tests/baselines/bert_large_uncased_whole_word_masking.json +++ b/tests/configs/examples/bert_large_uncased_whole_word_masking.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, diff --git a/tests/baselines/bloom_7b1.json b/tests/configs/examples/bloom_7b1.json similarity index 97% rename from tests/baselines/bloom_7b1.json rename to tests/configs/examples/bloom_7b1.json index 37251e8651..7b71b3a62f 100644 --- a/tests/baselines/bloom_7b1.json +++ b/tests/configs/examples/bloom_7b1.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "wikitext": { "num_train_epochs": 3, "eval_batch_size": 4, diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/configs/examples/bridgetower_large_itm_mlm_itc.json similarity index 100% rename from tests/baselines/bridgetower_large_itm_mlm_itc.json rename to tests/configs/examples/bridgetower_large_itm_mlm_itc.json diff --git a/tests/baselines/chatglm3_6b.json b/tests/configs/examples/chatglm3_6b.json similarity index 100% rename from tests/baselines/chatglm3_6b.json rename to tests/configs/examples/chatglm3_6b.json diff --git a/tests/baselines/clip_roberta.json b/tests/configs/examples/clip_roberta.json similarity index 99% rename from tests/baselines/clip_roberta.json rename to tests/configs/examples/clip_roberta.json index 0c2dfec435..d6f6f9bd89 100755 --- a/tests/baselines/clip_roberta.json +++ b/tests/configs/examples/clip_roberta.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "ydshieh/coco_dataset_script": { "num_train_epochs": 1, "eval_batch_size": 64, diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/configs/examples/distilbert_base_uncased.json similarity index 99% rename from tests/baselines/distilbert_base_uncased.json rename to tests/configs/examples/distilbert_base_uncased.json index 8678342e7b..a53c764cc0 100644 --- a/tests/baselines/distilbert_base_uncased.json +++ b/tests/configs/examples/distilbert_base_uncased.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, diff --git a/tests/baselines/falcon_40b.json b/tests/configs/examples/falcon_40b.json similarity index 100% rename from tests/baselines/falcon_40b.json rename to tests/configs/examples/falcon_40b.json diff --git a/tests/baselines/flan_t5_xxl.json b/tests/configs/examples/flan_t5_xxl.json similarity index 100% rename from tests/baselines/flan_t5_xxl.json rename to tests/configs/examples/flan_t5_xxl.json diff --git a/tests/baselines/gemma_2b_it.json b/tests/configs/examples/gemma_2b_it.json similarity index 100% rename from tests/baselines/gemma_2b_it.json rename to tests/configs/examples/gemma_2b_it.json diff --git a/tests/baselines/gemma_2b_it_eager.json b/tests/configs/examples/gemma_2b_it_eager.json similarity index 100% rename from tests/baselines/gemma_2b_it_eager.json rename to tests/configs/examples/gemma_2b_it_eager.json diff --git a/tests/baselines/gpt2.json b/tests/configs/examples/gpt2.json similarity index 99% rename from tests/baselines/gpt2.json rename to tests/configs/examples/gpt2.json index f293e9325c..4c9e89344e 100644 --- a/tests/baselines/gpt2.json +++ b/tests/configs/examples/gpt2.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "wikitext": { "num_train_epochs": 2, "eval_batch_size": 4, diff --git a/tests/baselines/gpt2_xl.json b/tests/configs/examples/gpt2_xl.json similarity index 98% rename from tests/baselines/gpt2_xl.json rename to tests/configs/examples/gpt2_xl.json index 68651d16e3..1235d46bfe 100644 --- a/tests/baselines/gpt2_xl.json +++ b/tests/configs/examples/gpt2_xl.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "wikitext": { "num_train_epochs": 2, "eval_batch_size": 4, diff --git a/tests/baselines/gpt_neox_20b.json b/tests/configs/examples/gpt_neox_20b.json similarity index 100% rename from tests/baselines/gpt_neox_20b.json rename to tests/configs/examples/gpt_neox_20b.json diff --git a/tests/baselines/idefics2_8b.json b/tests/configs/examples/idefics2_8b.json similarity index 100% rename from tests/baselines/idefics2_8b.json rename to tests/configs/examples/idefics2_8b.json diff --git a/tests/baselines/llama_7b.json b/tests/configs/examples/llama_7b.json similarity index 99% rename from tests/baselines/llama_7b.json rename to tests/configs/examples/llama_7b.json index dcfd6d3807..7ce34edaee 100644 --- a/tests/baselines/llama_7b.json +++ b/tests/configs/examples/llama_7b.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "databricks/databricks-dolly-15k": { "num_train_epochs": 1, "eval_batch_size": 2, diff --git a/tests/baselines/llava_1_5_7b_hf.json b/tests/configs/examples/llava_1_5_7b_hf.json similarity index 100% rename from tests/baselines/llava_1_5_7b_hf.json rename to tests/configs/examples/llava_1_5_7b_hf.json diff --git a/tests/baselines/protst_esm1b_for_sequential_classification.json b/tests/configs/examples/protst_esm1b_for_sequential_classification.json similarity index 100% rename from tests/baselines/protst_esm1b_for_sequential_classification.json rename to tests/configs/examples/protst_esm1b_for_sequential_classification.json diff --git a/tests/baselines/roberta_base.json b/tests/configs/examples/roberta_base.json similarity index 99% rename from tests/baselines/roberta_base.json rename to tests/configs/examples/roberta_base.json index 1c196fce1b..affe106e5f 100644 --- a/tests/baselines/roberta_base.json +++ b/tests/configs/examples/roberta_base.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, diff --git a/tests/baselines/roberta_large.json b/tests/configs/examples/roberta_large.json similarity index 99% rename from tests/baselines/roberta_large.json rename to tests/configs/examples/roberta_large.json index 4d7233e089..111dfa4533 100755 --- a/tests/baselines/roberta_large.json +++ b/tests/configs/examples/roberta_large.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "squad": { "num_train_epochs": 1, "eval_batch_size": 8, diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json similarity index 99% rename from tests/baselines/swin_base_patch4_window7_224_in22k.json rename to tests/configs/examples/swin_base_patch4_window7_224_in22k.json index 8e0a5c40c3..27efbd30a2 100644 --- a/tests/baselines/swin_base_patch4_window7_224_in22k.json +++ b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "cifar10": { "num_train_epochs": 1, "eval_batch_size": 64, diff --git a/tests/baselines/t5_small.json b/tests/configs/examples/t5_small.json similarity index 99% rename from tests/baselines/t5_small.json rename to tests/configs/examples/t5_small.json index 31f9c80ef6..b6d058a510 100644 --- a/tests/baselines/t5_small.json +++ b/tests/configs/examples/t5_small.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "cnn_dailymail": { "num_train_epochs": 1, "eval_batch_size": 4, diff --git a/tests/baselines/vit_base_patch16_224_in21k.json b/tests/configs/examples/vit_base_patch16_224_in21k.json similarity index 99% rename from tests/baselines/vit_base_patch16_224_in21k.json rename to tests/configs/examples/vit_base_patch16_224_in21k.json index 7b27afd29a..679640cf64 100644 --- a/tests/baselines/vit_base_patch16_224_in21k.json +++ b/tests/configs/examples/vit_base_patch16_224_in21k.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "cifar10": { "num_train_epochs": 1, "eval_batch_size": 64, diff --git a/tests/baselines/wav2vec2_base.json b/tests/configs/examples/wav2vec2_base.json similarity index 99% rename from tests/baselines/wav2vec2_base.json rename to tests/configs/examples/wav2vec2_base.json index b187e02d51..a4f76a5a39 100644 --- a/tests/baselines/wav2vec2_base.json +++ b/tests/configs/examples/wav2vec2_base.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "common_language": { "num_train_epochs": 10, "eval_batch_size": 64, diff --git a/tests/baselines/wav2vec2_large_lv60.json b/tests/configs/examples/wav2vec2_large_lv60.json similarity index 99% rename from tests/baselines/wav2vec2_large_lv60.json rename to tests/configs/examples/wav2vec2_large_lv60.json index 920239618b..862122ebb0 100644 --- a/tests/baselines/wav2vec2_large_lv60.json +++ b/tests/configs/examples/wav2vec2_large_lv60.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "regisss/librispeech_asr_for_optimum_habana_ci": { "num_train_epochs": 2, "eval_batch_size": 8, diff --git a/tests/baselines/whisper_small.json b/tests/configs/examples/whisper_small.json similarity index 99% rename from tests/baselines/whisper_small.json rename to tests/configs/examples/whisper_small.json index 055d321152..fac096950e 100644 --- a/tests/baselines/whisper_small.json +++ b/tests/configs/examples/whisper_small.json @@ -1,5 +1,5 @@ { - "gaudi": { + "gaudi1": { "mozilla-foundation/common_voice_11_0": { "num_train_epochs": 10, "eval_batch_size": 2, diff --git a/tests/test_examples.py b/tests/test_examples.py index 578ba7825e..eabfc2bc57 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -14,7 +14,10 @@ # limitations under the License. import json +import logging +import operator import os +import pytest import re import subprocess from distutils.util import strtobool @@ -54,7 +57,7 @@ ) -BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("baselines") +CONFIG_DIRECTORY = Path(__file__).parent.resolve() / Path("configs") / Path("examples") # Models should reach at least 99% of their baseline accuracy ACCURACY_PERF_FACTOR = 0.99 # Trainings/Evaluations should last at most 5% longer than the baseline @@ -62,7 +65,7 @@ IS_GAUDI2 = bool("gaudi2" == OH_DEVICE_CONTEXT) - +IS_GAUDI1 = bool("gaudi1" == OH_DEVICE_CONTEXT) def _get_supported_models_for_script( models_to_test: Dict[str, List[Tuple[str]]], @@ -454,29 +457,28 @@ def test(self): self._install_requirements(example_script.parent / "requirements.txt") - # collect baseline from _eager.json if eager_mode is True + # collect test_config from _eager.json if eager_mode is True if self.EAGER_MODE: - baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager" + config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + "_eager" else: - baseline_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") + config_name = model_name.split("/")[-1].replace("-", "_").replace(".", "_") - path_to_baseline = BASELINE_DIRECTORY / Path(baseline_name).with_suffix(".json") + path_to_config = CONFIG_DIRECTORY / Path(config_name).with_suffix(".json") - with path_to_baseline.open("r") as json_file: - device = "gaudi2" if IS_GAUDI2 else "gaudi" - baseline = json.load(json_file)[device] + with path_to_config.open("r") as json_file: + test_config = json.load(json_file)[OH_DEVICE_CONTEXT] if isinstance(self.TASK_NAME, list): for key in self.TASK_NAME: - if key in baseline: - baseline = baseline[key] + if key in test_config: + test_config = test_config[key] break - if "num_train_epochs" not in baseline: + if "num_train_epochs" not in test_config: raise ValueError( - f"Couldn't find a baseline associated to any of these tasks: {self.TASK_NAME}." + f"Couldn't find a test config associated to any of these tasks: {self.TASK_NAME}." ) self.TASK_NAME = key else: - baseline = baseline[self.TASK_NAME] + test_config = test_config[self.TASK_NAME] distribution = "single_card" if multi_card: @@ -507,7 +509,7 @@ def test(self): if fp8 and "llama" in model_name: env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt") - extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", []) + extra_command_line_arguments = test_config.get("distribution").get(distribution).get("extra_arguments", []) if self.EAGER_MODE: env_variables["PT_HPU_LAZY_MODE"] = "0" @@ -569,10 +571,10 @@ def test(self): gaudi_config_name, tmp_dir, task=self.TASK_NAME, - lr=baseline.get("distribution").get(distribution).get("learning_rate"), - train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"), - eval_batch_size=baseline.get("eval_batch_size"), - num_epochs=baseline.get("num_train_epochs"), + lr=test_config.get("distribution").get(distribution).get("learning_rate"), + train_batch_size=test_config.get("distribution").get(distribution).get("train_batch_size"), + eval_batch_size=test_config.get("eval_batch_size"), + num_epochs=test_config.get("num_train_epochs"), extra_command_line_arguments=extra_command_line_arguments, ) print(f"\n\nCommand to test: {' '.join(cmd_line[:])}\n") @@ -585,7 +587,7 @@ def test(self): with open(Path(tmp_dir) / "all_results.json") as fp: results = json.load(fp) # Ensure performance requirements (accuracy, training time) are met - self.assert_no_regression(results, baseline.get("distribution").get(distribution), model_name) + self.assert_no_regression(results, test_config.get("distribution").get(distribution).get("metrics"), model_name) # TODO: is a cleanup of the dataset cache needed? # self._cleanup_dataset_cache() @@ -612,17 +614,24 @@ class ExampleTesterBase(TestCase): DATASET_PARAMETER_NAME = "dataset_name" DATASET_NAME = None REGRESSION_METRICS = { - "eval_f1": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR), - "eval_accuracy": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR), - "perplexity": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR), - "eval_rougeLsum": (TestCase.assertGreaterEqual, ACCURACY_PERF_FACTOR), - "train_runtime": (TestCase.assertLessEqual, TIME_PERF_FACTOR), - "eval_wer": (TestCase.assertLessEqual, 2 - ACCURACY_PERF_FACTOR), - "train_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), - "eval_samples_per_second": (TestCase.assertGreaterEqual, 2 - TIME_PERF_FACTOR), + "eval_f1": (operator.ge, ACCURACY_PERF_FACTOR), + "eval_accuracy": (operator.ge, ACCURACY_PERF_FACTOR), + "perplexity": (operator.le, 2 - ACCURACY_PERF_FACTOR), + "eval_rougeLsum": (operator.ge, ACCURACY_PERF_FACTOR), + "train_runtime": (operator.le, TIME_PERF_FACTOR), + "eval_wer": (operator.le, 2 - ACCURACY_PERF_FACTOR), + "train_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR), + "eval_samples_per_second": (operator.ge, 2 - TIME_PERF_FACTOR), } EAGER_MODE = False + @pytest.fixture(autouse=True) + def _use_(self, baseline): + """ + https://docs.pytest.org/en/stable/how-to/unittest.html#using-autouse-fixtures-and-accessing-other-fixtures + """ + self.baseline = baseline + def _create_command_line( self, multi_card: bool, @@ -717,7 +726,7 @@ def _install_requirements(self, requirements_filename: Union[str, os.PathLike]): return_code = p.wait() self.assertEqual(return_code, 0) - def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str): + def assert_no_regression(self, results: Dict, metrics: list, model_name: str): """ Assert whether all possible performance requirements are met. Attributes: @@ -725,12 +734,10 @@ def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str): baseline (Dict): baseline to assert whether or not there is regression """ # Gather all the metrics to assess - metrics_to_assess = [] - for metric_name in self.REGRESSION_METRICS.keys(): - if metric_name in baseline and metric_name in results: - metrics_to_assess.append(metric_name) - # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM + metrics_to_assess = list(set(self.REGRESSION_METRICS.keys()) & set(metrics) & set(results.keys())) min_number_metrics = 3 + + # There is no accuracy metric for `run_clip.py`, `run_bridgetower.py` and BLOOM if ( self.EXAMPLE_NAME in ["run_clip", "run_bridgetower", "sft", "dpo", "ppo", "reward_modeling"] or "bloom" in model_name @@ -745,25 +752,26 @@ def assert_no_regression(self, results: Dict, baseline: Dict, model_name: str): ( f"{len(metrics_to_assess)} asserted metric(s) while at least 3 are expected (throughput + training" f" time + accuracy). Metrics to assert: {self.REGRESSION_METRICS.keys()}. Metrics received:" - f" {baseline.keys()}" + f" {metrics}" ), ) - # Message to display if one test fails - # This enables to show all the results and baselines even if one test fails before others - failure_message = "\n===== Assessed metrics (measured vs thresholded baseline) =====\n" - for metric_name in metrics_to_assess: - failure_message += f"{metric_name}: {results[metric_name]} vs {self.REGRESSION_METRICS[metric_name][1] * baseline[metric_name]}\n" - # Assess metrics + passed = True for metric_name in metrics_to_assess: - assert_function, threshold_factor = self.REGRESSION_METRICS[metric_name] - assert_function( - self, - results[metric_name], - threshold_factor * baseline[metric_name], - msg=f"for metric {metric_name}. {failure_message}", - ) + fn, threshold = self.REGRESSION_METRICS[metric_name] + def check(actual, ref): + check.msg = f"{metric_name}: {fn.__name__}({actual}, {threshold} * {ref})\n" + return fn(actual, threshold * ref) + check.msg = "" + + try: + self.baseline.assertRef(compare=check, context=[OH_DEVICE_CONTEXT], **{metric_name:results[metric_name]}) + except Exception as e: + logging.getLogger().error(check.msg) + passed = False + + assert passed, f"One or more metrics failed" class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_glue"):