From 6394293a502d12f82225eefb4fec44ef7986dbe9 Mon Sep 17 00:00:00 2001 From: "U. Artie Eoff" Date: Thu, 27 Feb 2025 16:43:34 -0500 Subject: [PATCH] Clone gaudi2 refs to gaudi3 Start with the same references on gaudi3 as gaudi2. Then, we can incrementally update them as needed. Signed-off-by: U. Artie Eoff --- .../fixture/tests/test_bnb_qlora.json | 3 + .../fixture/tests/test_diffusers.json | 48 ++ .../fixture/tests/test_encoder_decoder.json | 12 + .../fixture/tests/test_examples.json | 252 ++++++ .../tests/test_feature_extraction.json | 3 + .../fixture/tests/test_fp8_examples.json | 4 + .../fixture/tests/test_fsdp_examples.json | 8 + .../tests/test_image_classification.json | 3 + .../tests/test_image_segmentation.json | 3 + .../tests/test_image_to_text_example.json | 48 ++ .../fixture/tests/test_object_detection.json | 6 + .../tests/test_object_segmentation.json | 3 + .../fixture/tests/test_openclip_vqa.json | 6 + .../tests/test_sentence_transformers.json | 39 + .../fixture/tests/test_table_transformer.json | 3 + .../tests/test_text_generation_example.json | 222 +++++ .../fixture/tests/test_video_llava.json | 3 + .../fixture/tests/test_video_mae.json | 3 + .../test_zero_shot_object_detection.json | 3 + .../examples/CodeLlama_13b_Instruct_hf.json | 33 +- tests/configs/examples/LlamaGuard_7b.json | 33 +- tests/configs/examples/Llama_3_1_8B.json | 45 +- .../Llama_3_2_11B_Vision_Instruct.json | 48 +- tests/configs/examples/Qwen2_72B.json | 57 +- tests/configs/examples/Qwen2_7B.json | 92 ++- tests/configs/examples/albert_large_v2.json | 68 +- tests/configs/examples/albert_xxlarge_v1.json | 68 +- .../ast_finetuned_speech_commands_v2.json | 43 +- tests/configs/examples/bert_base_uncased.json | 34 +- ...bert_large_uncased_whole_word_masking.json | 132 ++- tests/configs/examples/bloom_7b1.json | 9 +- .../bridgetower_large_itm_mlm_itc.json | 37 +- tests/configs/examples/chatglm3_6b.json | 41 +- tests/configs/examples/clip_roberta.json | 46 +- .../examples/distilbert_base_uncased.json | 68 +- tests/configs/examples/falcon_40b.json | 95 ++- tests/configs/examples/flan_t5_xxl.json | 40 +- tests/configs/examples/gemma_2b_it.json | 74 +- tests/configs/examples/gemma_2b_it_eager.json | 30 +- tests/configs/examples/gpt2.json | 68 +- tests/configs/examples/gpt2_xl.json | 41 +- tests/configs/examples/gpt_neox_20b.json | 33 +- tests/configs/examples/idefics2_8b.json | 48 +- tests/configs/examples/llama_7b.json | 776 +++++++++++++++++- tests/configs/examples/llava_1_5_7b_hf.json | 48 +- ...t_esm1b_for_sequential_classification.json | 36 +- tests/configs/examples/roberta_base.json | 104 ++- tests/configs/examples/roberta_large.json | 104 ++- .../swin_base_patch4_window7_224_in22k.json | 80 +- tests/configs/examples/t5_small.json | 154 +++- .../examples/vit_base_patch16_224_in21k.json | 79 +- tests/configs/examples/wav2vec2_base.json | 51 +- .../configs/examples/wav2vec2_large_lv60.json | 57 +- tests/configs/examples/whisper_small.json | 55 +- 54 files changed, 3250 insertions(+), 249 deletions(-) diff --git a/tests/baselines/fixture/tests/test_bnb_qlora.json b/tests/baselines/fixture/tests/test_bnb_qlora.json index f917167fde..ddaaec170d 100644 --- a/tests/baselines/fixture/tests/test_bnb_qlora.json +++ b/tests/baselines/fixture/tests/test_bnb_qlora.json @@ -2,6 +2,9 @@ "tests/test_bnb_qlora.py::test_nf4_quantization_inference": { "gaudi2": { "eval_loss": 1.638 + }, + "gaudi3": { + "eval_loss": 1.638 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_diffusers.json b/tests/baselines/fixture/tests/test_diffusers.json index b84e40dab8..cde044dfd2 100644 --- a/tests/baselines/fixture/tests/test_diffusers.json +++ b/tests/baselines/fixture/tests/test_diffusers.json @@ -5,21 +5,33 @@ }, "gaudi2": { "throughput": 0.145 + }, + "gaudi3": { + "throughput": 0.145 } }, "tests/test_diffusers.py::GaudiFluxImg2ImgPipelineTester::test_flux_img2img_inference": { "gaudi2": { "throughput": 0.12 + }, + "gaudi3": { + "throughput": 0.12 } }, "tests/test_diffusers.py::GaudiFluxPipelineTester::test_flux_inference": { "gaudi2": { "throughput": 0.03 + }, + "gaudi3": { + "throughput": 0.03 } }, "tests/test_diffusers.py::GaudiStableDiffusion3PipelineTester::test_sd3_inference": { "gaudi2": { "throughput": 0.006 + }, + "gaudi3": { + "throughput": 0.006 } }, "tests/test_diffusers.py::GaudiStableDiffusionPipelineTester::test_no_generation_regression_ldm3d": { @@ -28,6 +40,9 @@ }, "gaudi2": { "throughput": 0.394 + }, + "gaudi3": { + "throughput": 0.394 } }, "tests/test_diffusers.py::GaudiStableDiffusionPipelineTester::test_no_throughput_regression_autocast": { @@ -36,6 +51,9 @@ }, "gaudi2": { "throughput": 0.394 + }, + "gaudi3": { + "throughput": 0.394 } }, "tests/test_diffusers.py::GaudiStableDiffusionPipelineTester::test_no_throughput_regression_bf16": { @@ -44,6 +62,9 @@ }, "gaudi2": { "throughput": 1.086 + }, + "gaudi3": { + "throughput": 1.086 } }, "tests/test_diffusers.py::GaudiStableDiffusionPipelineTester::test_sd_textual_inversion": { @@ -54,6 +75,10 @@ "gaudi2": { "train_runtime": 1.542460777796805, "train_samples_per_second": 131.7606336456344 + }, + "gaudi3": { + "train_runtime": 1.542460777796805, + "train_samples_per_second": 131.7606336456344 } }, "tests/test_diffusers.py::GaudiStableDiffusionXLPipelineTester::test_sdxl_textual_inversion": { @@ -64,6 +89,10 @@ "gaudi2": { "train_runtime": 74.92, "train_samples_per_second": 2.6694 + }, + "gaudi3": { + "train_runtime": 74.92, + "train_samples_per_second": 2.6694 } }, "tests/test_diffusers.py::GaudiStableDiffusionXLPipelineTester::test_stable_diffusion_xl_generation_throughput": { @@ -72,6 +101,9 @@ }, "gaudi2": { "throughput": 0.301 + }, + "gaudi3": { + "throughput": 0.301 } }, "tests/test_diffusers.py::GaudiStableVideoDiffusionPipelineTester::test_stable_video_diffusion_no_throughput_regression_bf16": { @@ -80,6 +112,9 @@ }, "gaudi2": { "throughput": 0.012 + }, + "gaudi3": { + "throughput": 0.012 } }, "tests/test_diffusers.py::I2VGenXLPipelineTests::test_i2vgen_xl_bf16": { @@ -88,6 +123,9 @@ }, "gaudi2": { "throughput": 0.017 + }, + "gaudi3": { + "throughput": 0.017 } }, "tests/test_diffusers.py::StableDiffusionInpaintPipelineTests::test_stable_diffusion_inpaint_no_throughput_regression": { @@ -96,6 +134,9 @@ }, "gaudi2": { "throughput": 1.025 + }, + "gaudi3": { + "throughput": 1.025 } }, "tests/test_diffusers.py::StableDiffusionXLInpaintPipelineTests::test_stable_diffusion_xl_inpaint_no_throughput_regression": { @@ -104,6 +145,9 @@ }, "gaudi2": { "throughput": 0.175 + }, + "gaudi3": { + "throughput": 0.175 } }, "tests/test_diffusers.py::TrainControlNet::test_train_controlnet": { @@ -114,6 +158,10 @@ "gaudi2": { "train_runtime": 1.8647471838630736, "train_samples_per_second": 120.123522340414 + }, + "gaudi3": { + "train_runtime": 1.8647471838630736, + "train_samples_per_second": 120.123522340414 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_encoder_decoder.json b/tests/baselines/fixture/tests/test_encoder_decoder.json index 25c780e5dd..b4196a7766 100644 --- a/tests/baselines/fixture/tests/test_encoder_decoder.json +++ b/tests/baselines/fixture/tests/test_encoder_decoder.json @@ -7,6 +7,10 @@ "gaudi2": { "predict_rougeLsum": 28.9801, "predict_samples_per_second": 4.339 + }, + "gaudi3": { + "predict_rougeLsum": 28.9801, + "predict_samples_per_second": 4.339 } }, "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[t5-3b-Habana/t5-2-1]": { @@ -17,6 +21,10 @@ "gaudi2": { "predict_rougeLsum": 21.8877, "predict_samples_per_second": 3.848 + }, + "gaudi3": { + "predict_rougeLsum": 21.8877, + "predict_samples_per_second": 3.848 } }, "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_translation_bf16[t5-small-Habana/t5-2-1]": { @@ -27,6 +35,10 @@ "gaudi2": { "predict_bleu": 11.7277, "predict_samples_per_second": 11.648 + }, + "gaudi3": { + "predict_bleu": 11.7277, + "predict_samples_per_second": 11.648 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_examples.json b/tests/baselines/fixture/tests/test_examples.json index d1820727f8..831b0e7dac 100644 --- a/tests/baselines/fixture/tests/test_examples.json +++ b/tests/baselines/fixture/tests/test_examples.json @@ -4,6 +4,11 @@ "perplexity": 26.39, "train_runtime": 356.07, "train_samples_per_second": 14.06 + }, + "gaudi3": { + "perplexity": 26.39, + "train_runtime": 356.07, + "train_samples_per_second": 14.06 } }, "tests/test_examples.py::CausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_single_card": { @@ -16,6 +21,11 @@ "perplexity": 3.8436, "train_runtime": 113.9713, "train_samples_per_second": 18.428 + }, + "gaudi3": { + "perplexity": 3.8436, + "train_runtime": 113.9713, + "train_samples_per_second": 18.428 } }, "tests/test_examples.py::DeepSpeedTextClassificationExampleTester::test_run_glue_LlamaGuard-7b_deepspeed": { @@ -23,6 +33,11 @@ "eval_f1": 0.8873483535528596, "train_runtime": 62.4539, "train_samples_per_second": 342.169 + }, + "gaudi3": { + "eval_f1": 0.8873483535528596, + "train_runtime": 62.4539, + "train_samples_per_second": 342.169 } }, "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_CodeLlama-13b-Instruct-hf_deepspeed": { @@ -30,6 +45,11 @@ "perplexity": 6.877496628184696, "train_runtime": 542.2985, "train_samples_per_second": 18.789 + }, + "gaudi3": { + "perplexity": 6.877496628184696, + "train_runtime": 542.2985, + "train_samples_per_second": 18.789 } }, "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_bloom-7b1_deepspeed": { @@ -43,6 +63,11 @@ "perplexity": 16.51629, "train_runtime": 445, "train_samples_per_second": 18.216 + }, + "gaudi3": { + "perplexity": 16.51629, + "train_runtime": 445, + "train_samples_per_second": 18.216 } }, "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_deepspeed": { @@ -50,6 +75,11 @@ "perplexity": 924.062, "train_runtime": 75.518, "train_samples_per_second": 81.097 + }, + "gaudi3": { + "perplexity": 924.062, + "train_runtime": 75.518, + "train_samples_per_second": 81.097 } }, "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt-neox-20b_deepspeed": { @@ -57,6 +87,11 @@ "perplexity": 8.169664686471043, "train_runtime": 445, "train_samples_per_second": 7.328 + }, + "gaudi3": { + "perplexity": 8.169664686471043, + "train_runtime": 445, + "train_samples_per_second": 7.328 } }, "tests/test_examples.py::DeepspeedCausalLanguageModelingExampleTester::test_run_clm_gpt2-xl_deepspeed": { @@ -69,6 +104,11 @@ "perplexity": 13.237754028004865, "train_runtime": 206.5775, "train_samples_per_second": 95.539 + }, + "gaudi3": { + "perplexity": 13.237754028004865, + "train_runtime": 206.5775, + "train_samples_per_second": 95.539 } }, "tests/test_examples.py::DeepspeedSFTExampleTester::test_sft_Qwen2-72B_deepspeed": { @@ -76,6 +116,11 @@ "perplexity": 3.7020898897918824, "train_runtime": 918.8018, "train_samples_per_second": 7.554 + }, + "gaudi3": { + "perplexity": 3.7020898897918824, + "train_runtime": 918.8018, + "train_samples_per_second": 7.554 } }, "tests/test_examples.py::DeepspeedSummarizationExampleTester::test_run_summarization_flan-t5-xxl_deepspeed": { @@ -83,6 +128,11 @@ "eval_rougeLsum": 29.308, "train_runtime": 155.86, "train_samples_per_second": 28.387 + }, + "gaudi3": { + "eval_rougeLsum": 29.308, + "train_runtime": 155.86, + "train_samples_per_second": 28.387 } }, "tests/test_examples.py::EagerModeCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_single_card": { @@ -90,6 +140,11 @@ "perplexity": 26.69, "train_runtime": 560.8188, "train_samples_per_second": 8.597 + }, + "gaudi3": { + "perplexity": 26.69, + "train_runtime": 560.8188, + "train_samples_per_second": 8.597 } }, "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_single_card": { @@ -102,6 +157,11 @@ "eval_accuracy": 0.9850666666666666, "train_runtime": 77.8934, "train_samples_per_second": 826.766 + }, + "gaudi3": { + "eval_accuracy": 0.9850666666666666, + "train_runtime": 77.8934, + "train_samples_per_second": 826.766 } }, "tests/test_examples.py::ImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_single_card": { @@ -114,6 +174,11 @@ "eval_accuracy": 0.9690666666666666, "train_runtime": 54.9734, "train_samples_per_second": 870.272 + }, + "gaudi3": { + "eval_accuracy": 0.9690666666666666, + "train_runtime": 54.9734, + "train_samples_per_second": 870.272 } }, "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_ast-finetuned-speech-commands-v2_multi_card": { @@ -122,6 +187,12 @@ "eval_samples_per_second": 2301.088, "train_runtime": 139.9477, "train_samples_per_second": 1955.74 + }, + "gaudi3": { + "eval_accuracy": 0.1871, + "eval_samples_per_second": 2301.088, + "train_runtime": 139.9477, + "train_samples_per_second": 1955.74 } }, "tests/test_examples.py::MultiCardAudioClassificationExampleTester::test_run_audio_classification_wav2vec2-base_multi_card": { @@ -136,12 +207,22 @@ "eval_samples_per_second": 3640.021, "train_runtime": 63.4079, "train_samples_per_second": 2975.844 + }, + "gaudi3": { + "eval_accuracy": 0.7228, + "eval_samples_per_second": 3640.021, + "train_runtime": 63.4079, + "train_samples_per_second": 2975.844 } }, "tests/test_examples.py::MultiCardBridgetowerExampleTester::test_run_bridgetower_bridgetower-large-itm-mlm-itc_multi_card": { "gaudi2": { "train_runtime": 224.42, "train_samples_per_second": 904.93 + }, + "gaudi3": { + "train_runtime": 224.42, + "train_samples_per_second": 904.93 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingAdaloraExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -149,6 +230,11 @@ "perplexity": 2.59, "train_runtime": 459, "train_samples_per_second": 107 + }, + "gaudi3": { + "perplexity": 2.59, + "train_runtime": 459, + "train_samples_per_second": 107 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingExampleTester::test_run_clm_gemma-2b-it_multi_card": { @@ -156,6 +242,11 @@ "perplexity": 954.5995, "train_runtime": 82.6617, "train_samples_per_second": 94.524 + }, + "gaudi3": { + "perplexity": 954.5995, + "train_runtime": 82.6617, + "train_samples_per_second": 94.524 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingIA3ExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -163,6 +254,11 @@ "perplexity": 3.3, "train_runtime": 262.8, "train_samples_per_second": 161 + }, + "gaudi3": { + "perplexity": 3.3, + "train_runtime": 262.8, + "train_samples_per_second": 161 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_falcon-40b_multi_card": { @@ -170,6 +266,11 @@ "perplexity": 1.6, "train_runtime": 710, "train_samples_per_second": 15.0 + }, + "gaudi3": { + "perplexity": 1.6, + "train_runtime": 710, + "train_samples_per_second": 15.0 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester2::test_run_lora_clm_llama-7b_multi_card": { @@ -177,6 +278,11 @@ "perplexity": 2.3665, "train_runtime": 294.5707, "train_samples_per_second": 148.093 + }, + "gaudi3": { + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_falcon-40b_multi_card": { @@ -184,6 +290,11 @@ "perplexity": 4.0, "train_runtime": 550, "train_samples_per_second": 15.0 + }, + "gaudi3": { + "perplexity": 4.0, + "train_runtime": 550, + "train_samples_per_second": 15.0 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLORAExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -196,6 +307,11 @@ "perplexity": 2.3665, "train_runtime": 294.5707, "train_samples_per_second": 148.093 + }, + "gaudi3": { + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLORAFSDPCompileExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -203,6 +319,11 @@ "perplexity": 2.4259, "train_runtime": 186.2483, "train_samples_per_second": 93.5 + }, + "gaudi3": { + "perplexity": 2.4259, + "train_runtime": 186.2483, + "train_samples_per_second": 93.5 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLlamaAdapterExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -210,6 +331,11 @@ "perplexity": 5.575, "train_runtime": 131.7, "train_samples_per_second": 294 + }, + "gaudi3": { + "perplexity": 5.575, + "train_runtime": 131.7, + "train_samples_per_second": 294 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLnExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -217,6 +343,11 @@ "perplexity": 2.83, "train_runtime": 249, "train_samples_per_second": 165 + }, + "gaudi3": { + "perplexity": 2.83, + "train_runtime": 249, + "train_samples_per_second": 165 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLoRACPExampleTester::test_run_lora_clm_llama-7b_deepspeed": { @@ -224,6 +355,11 @@ "perplexity": 2.8889, "train_runtime": 147.3597, "train_samples_per_second": 34.41 + }, + "gaudi3": { + "perplexity": 2.8889, + "train_runtime": 147.3597, + "train_samples_per_second": 34.41 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingLoRAFP8ExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -231,6 +367,11 @@ "perplexity": 2.3692, "train_runtime": 411.9935, "train_samples_per_second": 232.439 + }, + "gaudi3": { + "perplexity": 2.3692, + "train_runtime": 411.9935, + "train_samples_per_second": 232.439 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingPTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { @@ -238,6 +379,11 @@ "perplexity": 1.047, "train_runtime": 18.7, "train_samples_per_second": 63.161 + }, + "gaudi3": { + "perplexity": 1.047, + "train_runtime": 18.7, + "train_samples_per_second": 63.161 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingPrefixTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { @@ -245,6 +391,11 @@ "perplexity": 1.172, "train_runtime": 16.1, "train_samples_per_second": 63.249 + }, + "gaudi3": { + "perplexity": 1.172, + "train_runtime": 16.1, + "train_samples_per_second": 63.249 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingPromptTuningExampleTester::test_run_prompt_tuning_clm_llama-7b_multi_card": { @@ -252,6 +403,11 @@ "perplexity": 1.224, "train_runtime": 16.5, "train_samples_per_second": 63.161 + }, + "gaudi3": { + "perplexity": 1.224, + "train_runtime": 16.5, + "train_samples_per_second": 63.161 } }, "tests/test_examples.py::MultiCardCausalLanguageModelingVeraExampleTester::test_run_lora_clm_llama-7b_multi_card": { @@ -259,12 +415,21 @@ "perplexity": 9.064502567217577, "train_runtime": 312.9258, "train_samples_per_second": 127.305 + }, + "gaudi3": { + "perplexity": 9.064502567217577, + "train_runtime": 312.9258, + "train_samples_per_second": 127.305 } }, "tests/test_examples.py::MultiCardDPOExampleTester::test_dpo_llama-7b_multi_card": { "gaudi2": { "train_runtime": 234.6471, "train_samples_per_second": 13.499 + }, + "gaudi3": { + "train_runtime": 234.6471, + "train_samples_per_second": 13.499 } }, "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_swin-base-patch4-window7-224-in22k_multi_card": { @@ -277,6 +442,11 @@ "eval_accuracy": 0.9821, "train_runtime": 62.9986, "train_samples_per_second": 6202.525 + }, + "gaudi3": { + "eval_accuracy": 0.9821, + "train_runtime": 62.9986, + "train_samples_per_second": 6202.525 } }, "tests/test_examples.py::MultiCardImageClassificationExampleTester::test_run_image_classification_vit-base-patch16-224-in21k_multi_card": { @@ -289,6 +459,11 @@ "eval_accuracy": 0.9679, "train_runtime": 23.99, "train_samples_per_second": 6718.643 + }, + "gaudi3": { + "eval_accuracy": 0.9679, + "train_runtime": 23.99, + "train_samples_per_second": 6718.643 } }, "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_Llama-3.2-11B-Vision-Instruct_multi_card": { @@ -296,6 +471,11 @@ "eval_accuracy": 0.6, "train_runtime": 350, "train_samples_per_second": 20.48 + }, + "gaudi3": { + "eval_accuracy": 0.6, + "train_runtime": 350, + "train_samples_per_second": 20.48 } }, "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_idefics2-8b_multi_card": { @@ -303,6 +483,11 @@ "eval_accuracy": 0.6, "train_runtime": 286, "train_samples_per_second": 11.8 + }, + "gaudi3": { + "eval_accuracy": 0.6, + "train_runtime": 286, + "train_samples_per_second": 11.8 } }, "tests/test_examples.py::MultiCardImageToTextModelingLoRAExampleTester::test_run_image2text_lora_finetune_llava-1.5-7b-hf_multi_card": { @@ -310,6 +495,11 @@ "eval_accuracy": 0.2122, "train_runtime": 118.5782, "train_samples_per_second": 25.146 + }, + "gaudi3": { + "eval_accuracy": 0.2122, + "train_runtime": 118.5782, + "train_samples_per_second": 25.146 } }, "tests/test_examples.py::MultiCardMaskedLanguageModelingExampleTester::test_run_mlm_roberta-large_multi_card": { @@ -322,12 +512,21 @@ "perplexity": 2.829522488584474, "train_runtime": 22.7101, "train_samples_per_second": 1056.875 + }, + "gaudi3": { + "perplexity": 2.829522488584474, + "train_runtime": 22.7101, + "train_samples_per_second": 1056.875 } }, "tests/test_examples.py::MultiCardPPOExampleTester::test_ppo_llama-7b_multi_card": { "gaudi2": { "train_runtime": 62, "train_samples_per_second": 0.5 + }, + "gaudi3": { + "train_runtime": 62, + "train_samples_per_second": 0.5 } }, "tests/test_examples.py::MultiCardProteinFoldingClassificationTester::test_run_sequence_classification_protst-esm1b-for-sequential-classification_multi_card": { @@ -335,6 +534,11 @@ "eval_accuracy": 0.5436668594563332, "train_runtime": 38.9504, "train_samples_per_second": 768.648 + }, + "gaudi3": { + "eval_accuracy": 0.5436668594563332, + "train_runtime": 38.9504, + "train_samples_per_second": 768.648 } }, "tests/test_examples.py::MultiCardQuestionAnsweringExampleTester::test_run_qa_roberta-large_multi_card": { @@ -347,30 +551,51 @@ "eval_f1": 94.09, "train_runtime": 79.333, "train_samples_per_second": 2138.366 + }, + "gaudi3": { + "eval_f1": 94.09, + "train_runtime": 79.333, + "train_samples_per_second": 2138.366 } }, "tests/test_examples.py::MultiCardRewardExampleTester::test_reward_modeling_llama-7b_multi_card": { "gaudi2": { "train_runtime": 250, "train_samples_per_second": 1.6 + }, + "gaudi3": { + "train_runtime": 250, + "train_samples_per_second": 1.6 } }, "tests/test_examples.py::MultiCardSFTChatExampleTester::test_sft_Qwen2-7B_multi_card": { "gaudi2": { "train_runtime": 423.995, "train_samples_per_second": 7.342 + }, + "gaudi3": { + "train_runtime": 423.995, + "train_samples_per_second": 7.342 } }, "tests/test_examples.py::MultiCardSFTChatPeftExampleTester::test_sft_Qwen2-7B_multi_card": { "gaudi2": { "train_runtime": 410, "train_samples_per_second": 120 + }, + "gaudi3": { + "train_runtime": 410, + "train_samples_per_second": 120 } }, "tests/test_examples.py::MultiCardSFTExampleTester::test_sft_llama-7b_multi_card": { "gaudi2": { "train_runtime": 206, "train_samples_per_second": 51.54 + }, + "gaudi3": { + "train_runtime": 206, + "train_samples_per_second": 51.54 } }, "tests/test_examples.py::MultiCardSeq2SeqSpeechRecognitionExampleTester::test_run_speech_recognition_seq2seq_whisper-small_multi_card": { @@ -385,6 +610,12 @@ "eval_wer": 0.4693843594009983, "train_runtime": 380.0, "train_samples_per_second": 218.0 + }, + "gaudi3": { + "eval_samples_per_second": 31.0, + "eval_wer": 0.4693843594009983, + "train_runtime": 380.0, + "train_samples_per_second": 218.0 } }, "tests/test_examples.py::MultiCardSpeechRecognitionExampleTester::test_run_speech_recognition_ctc_wav2vec2-large-lv60_multi_card": { @@ -399,6 +630,12 @@ "eval_wer": 0.1109, "train_runtime": 308.8036, "train_samples_per_second": 225.572 + }, + "gaudi3": { + "eval_samples_per_second": 196.665, + "eval_wer": 0.1109, + "train_runtime": 308.8036, + "train_samples_per_second": 225.572 } }, "tests/test_examples.py::MultiCardTextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_multi_card": { @@ -411,6 +648,11 @@ "eval_f1": 0.8452579034941764, "train_runtime": 31.445, "train_samples_per_second": 2845.068 + }, + "gaudi3": { + "eval_f1": 0.8452579034941764, + "train_runtime": 31.445, + "train_samples_per_second": 2845.068 } }, "tests/test_examples.py::QuestionAnsweringExampleTester::test_run_qa_roberta-large_single_card": { @@ -423,6 +665,11 @@ "eval_f1": 94.5886, "train_runtime": 361.4789, "train_samples_per_second": 266.47 + }, + "gaudi3": { + "eval_f1": 94.5886, + "train_runtime": 361.4789, + "train_samples_per_second": 266.47 } }, "tests/test_examples.py::TextClassificationExampleTester::test_run_glue_bert-large-uncased-whole-word-masking_single_card": { @@ -435,6 +682,11 @@ "eval_f1": 0.867, "train_runtime": 33.2909, "train_samples_per_second": 1100.598 + }, + "gaudi3": { + "eval_f1": 0.867, + "train_runtime": 33.2909, + "train_samples_per_second": 1100.598 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_feature_extraction.json b/tests/baselines/fixture/tests/test_feature_extraction.json index bf336f6c17..8293e8fed7 100644 --- a/tests/baselines/fixture/tests/test_feature_extraction.json +++ b/tests/baselines/fixture/tests/test_feature_extraction.json @@ -5,6 +5,9 @@ }, "gaudi2": { "time_per_iter": 0.6812 + }, + "gaudi3": { + "time_per_iter": 0.6812 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_fp8_examples.json b/tests/baselines/fixture/tests/test_fp8_examples.json index 43aa371fa1..0487cbc1e8 100644 --- a/tests/baselines/fixture/tests/test_fp8_examples.json +++ b/tests/baselines/fixture/tests/test_fp8_examples.json @@ -3,6 +3,10 @@ "gaudi2": { "eval_accuracy": 0.7538, "train_samples_per_second": 12.373 + }, + "gaudi3": { + "eval_accuracy": 0.7538, + "train_samples_per_second": 12.373 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_fsdp_examples.json b/tests/baselines/fixture/tests/test_fsdp_examples.json index 834ecba8a6..b9e17c7354 100644 --- a/tests/baselines/fixture/tests/test_fsdp_examples.json +++ b/tests/baselines/fixture/tests/test_fsdp_examples.json @@ -3,12 +3,20 @@ "gaudi2": { "eval_f1": 85.7077, "train_samples_per_second": 2983.533 + }, + "gaudi3": { + "eval_f1": 85.7077, + "train_samples_per_second": 2983.533 } }, "tests/test_fsdp_examples.py::test_fsdp_bf16[meta-llama/Llama-2-7b-hf--language-modeling-8-8-run_lora_clm.py-auto_wrap]": { "gaudi2": { "train_loss": 0.9093, "train_samples_per_second": 85.016 + }, + "gaudi3": { + "train_loss": 0.9093, + "train_samples_per_second": 85.016 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_image_classification.json b/tests/baselines/fixture/tests/test_image_classification.json index 28868221e1..cc903834f8 100644 --- a/tests/baselines/fixture/tests/test_image_classification.json +++ b/tests/baselines/fixture/tests/test_image_classification.json @@ -2,6 +2,9 @@ "tests/test_image_classification.py::GaudiFastViTTester::test_no_latency_regression_autocast": { "gaudi2": { "latency": 2.527062664031982 + }, + "gaudi3": { + "latency": 2.527062664031982 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_image_segmentation.json b/tests/baselines/fixture/tests/test_image_segmentation.json index 87a0523de8..dbec2bf555 100644 --- a/tests/baselines/fixture/tests/test_image_segmentation.json +++ b/tests/baselines/fixture/tests/test_image_segmentation.json @@ -2,6 +2,9 @@ "tests/test_image_segmentation.py::GaudiSAMTester::test_no_latency_regression_bf16": { "gaudi2": { "latency": 98.92215728759766 + }, + "gaudi3": { + "latency": 98.92215728759766 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_image_to_text_example.json b/tests/baselines/fixture/tests/test_image_to_text_example.json index d9bab43d39..e95c6d88d8 100644 --- a/tests/baselines/fixture/tests/test_image_to_text_example.json +++ b/tests/baselines/fixture/tests/test_image_to_text_example.json @@ -2,21 +2,33 @@ "tests/test_image_to_text_example.py::test_image_to_text_bf16[HuggingFaceM4/idefics2-8b-1]": { "gaudi2": { "throughput": 21.89944593215077 + }, + "gaudi3": { + "throughput": 21.89944593215077 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-2B-Instruct-1]": { "gaudi2": { "throughput": 28.755882208438422 + }, + "gaudi3": { + "throughput": 28.755882208438422 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-7B-Instruct-1]": { "gaudi2": { "throughput": 19.32562189532818 + }, + "gaudi3": { + "throughput": 19.32562189532818 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[google/paligemma-3b-mix-224-1]": { "gaudi2": { "throughput": 132.8949150246155 + }, + "gaudi3": { + "throughput": 132.8949150246155 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-13b-hf-1]": { @@ -25,6 +37,9 @@ }, "gaudi2": { "throughput": 48.54364937033955 + }, + "gaudi3": { + "throughput": 48.54364937033955 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-7b-hf-1]": { @@ -33,6 +48,9 @@ }, "gaudi2": { "throughput": 77.98733740859008 + }, + "gaudi3": { + "throughput": 77.98733740859008 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-mistral-7b-hf-1]": { @@ -41,6 +59,9 @@ }, "gaudi2": { "throughput": 33.17984878151546 + }, + "gaudi3": { + "throughput": 33.17984878151546 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-13b-hf-1]": { @@ -49,46 +70,73 @@ }, "gaudi2": { "throughput": 23.527610042925 + }, + "gaudi3": { + "throughput": 23.527610042925 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-7b-hf-1]": { "gaudi2": { "throughput": 35.00608681379742 + }, + "gaudi3": { + "throughput": 35.00608681379742 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[meta-llama/Llama-3.2-11B-Vision-Instruct-1]": { "gaudi2": { "throughput": 18.974541922240313 + }, + "gaudi3": { + "throughput": 18.974541922240313 } }, "tests/test_image_to_text_example.py::test_image_to_text_bf16[tiiuae/falcon-11B-vlm-1]": { "gaudi2": { "throughput": 23.69260849957278 + }, + "gaudi3": { + "throughput": 23.69260849957278 } }, "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-13b-hf-1]": { "gaudi2": { "throughput": 67.20488222876344 + }, + "gaudi3": { + "throughput": 67.20488222876344 } }, "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-7b-hf-1]": { "gaudi2": { "throughput": 98.72578382705062 + }, + "gaudi3": { + "throughput": 98.72578382705062 } }, "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-mistral-7b-hf-1]": { "gaudi2": { "throughput": 45.011551008367086 + }, + "gaudi3": { + "throughput": 45.011551008367086 } }, "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-13b-hf-1]": { "gaudi2": { "throughput": 30.9535718774675 + }, + "gaudi3": { + "throughput": 30.9535718774675 } }, "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-7b-hf-1]": { "gaudi2": { "throughput": 45.18544502949674 + }, + "gaudi3": { + "throughput": 45.18544502949674 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_object_detection.json b/tests/baselines/fixture/tests/test_object_detection.json index 176a27036a..c1c93b6c52 100644 --- a/tests/baselines/fixture/tests/test_object_detection.json +++ b/tests/baselines/fixture/tests/test_object_detection.json @@ -5,6 +5,9 @@ }, "gaudi2": { "latency": 7.0 + }, + "gaudi3": { + "latency": 7.0 } }, "tests/test_object_detection.py::GaudiDetrResnet50_Tester::test_no_latency_regression_autocast": { @@ -13,6 +16,9 @@ }, "gaudi2": { "latency": 7.0 + }, + "gaudi3": { + "latency": 7.0 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_object_segmentation.json b/tests/baselines/fixture/tests/test_object_segmentation.json index 87b9ac28dc..65ae50ea0f 100644 --- a/tests/baselines/fixture/tests/test_object_segmentation.json +++ b/tests/baselines/fixture/tests/test_object_segmentation.json @@ -2,6 +2,9 @@ "tests/test_object_segmentation.py::GaudiClipSegTester::test_no_latency_regression_autocast": { "gaudi2": { "latency": 5.3107380867004395 + }, + "gaudi3": { + "latency": 5.3107380867004395 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_openclip_vqa.json b/tests/baselines/fixture/tests/test_openclip_vqa.json index 91f9d7d601..2daee462ac 100644 --- a/tests/baselines/fixture/tests/test_openclip_vqa.json +++ b/tests/baselines/fixture/tests/test_openclip_vqa.json @@ -5,6 +5,9 @@ }, "gaudi2": { "throughput": 1472 + }, + "gaudi3": { + "throughput": 1472 } }, "tests/test_openclip_vqa.py::test_openclip_vqa_bf16[microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224]": { @@ -13,6 +16,9 @@ }, "gaudi2": { "throughput": 1816 + }, + "gaudi3": { + "throughput": 1816 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_sentence_transformers.json b/tests/baselines/fixture/tests/test_sentence_transformers.json index 23f4f6af97..dfa5753e50 100644 --- a/tests/baselines/fixture/tests/test_sentence_transformers.json +++ b/tests/baselines/fixture/tests/test_sentence_transformers.json @@ -5,6 +5,9 @@ }, "gaudi2": { "measured_throughput": 3614.2610109716247 + }, + "gaudi3": { + "measured_throughput": 3614.2610109716247 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-MiniLM-L6-v2]": { @@ -13,6 +16,9 @@ }, "gaudi2": { "measured_throughput": 2615.6975354038477 + }, + "gaudi3": { + "measured_throughput": 2615.6975354038477 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-distilroberta-v1]": { @@ -21,6 +27,9 @@ }, "gaudi2": { "measured_throughput": 958.5097903298335 + }, + "gaudi3": { + "measured_throughput": 958.5097903298335 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-mpnet-base-v2]": { @@ -29,6 +38,9 @@ }, "gaudi2": { "measured_throughput": 762.5595168883357 + }, + "gaudi3": { + "measured_throughput": 762.5595168883357 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/distiluse-base-multilingual-cased-v1]": { @@ -37,6 +49,9 @@ }, "gaudi2": { "measured_throughput": 3487.3319366004903 + }, + "gaudi3": { + "measured_throughput": 3487.3319366004903 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/distiluse-base-multilingual-cased-v2]": { @@ -45,6 +60,9 @@ }, "gaudi2": { "measured_throughput": 3807.2486282025716 + }, + "gaudi3": { + "measured_throughput": 3807.2486282025716 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/multi-qa-MiniLM-L6-cos-v1]": { @@ -53,6 +71,9 @@ }, "gaudi2": { "measured_throughput": 1208.3672807492396 + }, + "gaudi3": { + "measured_throughput": 1208.3672807492396 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/multi-qa-distilbert-cos-v1]": { @@ -61,6 +82,9 @@ }, "gaudi2": { "measured_throughput": 944.6166139694299 + }, + "gaudi3": { + "measured_throughput": 944.6166139694299 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/multi-qa-mpnet-base-dot-v1]": { @@ -69,6 +93,9 @@ }, "gaudi2": { "measured_throughput": 545.3360251829846 + }, + "gaudi3": { + "measured_throughput": 545.3360251829846 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-MiniLM-L3-v2]": { @@ -77,6 +104,9 @@ }, "gaudi2": { "measured_throughput": 5734.318427972881 + }, + "gaudi3": { + "measured_throughput": 5734.318427972881 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-albert-small-v2]": { @@ -85,6 +115,9 @@ }, "gaudi2": { "measured_throughput": 3896.1911011860166 + }, + "gaudi3": { + "measured_throughput": 3896.1911011860166 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2]": { @@ -93,6 +126,9 @@ }, "gaudi2": { "measured_throughput": 3558.0778715789693 + }, + "gaudi3": { + "measured_throughput": 3558.0778715789693 } }, "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-multilingual-mpnet-base-v2]": { @@ -101,6 +137,9 @@ }, "gaudi2": { "measured_throughput": 2392.1654748794062 + }, + "gaudi3": { + "measured_throughput": 2392.1654748794062 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_table_transformer.json b/tests/baselines/fixture/tests/test_table_transformer.json index 7e1b6cee61..873efedbe6 100644 --- a/tests/baselines/fixture/tests/test_table_transformer.json +++ b/tests/baselines/fixture/tests/test_table_transformer.json @@ -5,6 +5,9 @@ }, "gaudi2": { "latency": 2.2 + }, + "gaudi3": { + "latency": 2.2 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_text_generation_example.json b/tests/baselines/fixture/tests/test_text_generation_example.json index de9b3f1014..2915b129e1 100644 --- a/tests/baselines/fixture/tests/test_text_generation_example.json +++ b/tests/baselines/fixture/tests/test_text_generation_example.json @@ -2,26 +2,41 @@ "tests/test_text_generation_example.py::test_text_generation_awq[TheBloke/Llama-2-7b-Chat-AWQ-1-10-False-128-2048]": { "gaudi2": { "throughput": 456.7 + }, + "gaudi3": { + "throughput": 456.7 } }, "tests/test_text_generation_example.py::test_text_generation_beam_search[Qwen/Qwen2-7b-Instruct-1-True]": { "gaudi2": { "throughput": 91.24938949709826 + }, + "gaudi3": { + "throughput": 91.24938949709826 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[CohereForAI/c4ai-command-r-v01-1-False-False]": { "gaudi2": { "throughput": 29.50315234651154 + }, + "gaudi3": { + "throughput": 29.50315234651154 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Deci/DeciLM-7B-1-False-False]": { "gaudi2": { "throughput": 115 + }, + "gaudi3": { + "throughput": 115 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-j-6b-1-False-False]": { "gaudi2": { "throughput": 160.5823842101192 + }, + "gaudi3": { + "throughput": 160.5823842101192 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-j-6b-1-True-False]": { @@ -32,11 +47,17 @@ "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-neo-2.7B-1-False-False]": { "gaudi2": { "throughput": 257.2476416844122 + }, + "gaudi3": { + "throughput": 257.2476416844122 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-neox-20b-1-False-False]": { "gaudi2": { "throughput": 50.67672679310354 + }, + "gaudi3": { + "throughput": 50.67672679310354 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen1.5-7B-1-False-False]": { @@ -47,22 +68,35 @@ "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen1.5-7B-4-False-False]": { "gaudi2": { "throughput": 490.8621617893209 + }, + "gaudi3": { + "throughput": 490.8621617893209 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen1.5-MoE-A2.7B-1-True-False]": { "gaudi2": { "throughput": 44.25834541569395 + }, + "gaudi3": { + "throughput": 44.25834541569395 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen2-7B-256-False-True]": { "gaudi2": { "output": "DeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of PyTorch and TensorFlow, and it supports a wide range of models, including transformers, convolutional neural networks, and recurrent neural networks.\nDeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of Py", "throughput": 8870.945160540245 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of PyTorch and TensorFlow, and it supports a wide range of models, including transformers, convolutional neural networks, and recurrent neural networks.\nDeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of Py", + "throughput": 8870.945160540245 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen2.5-7B-4-False-False]": { "gaudi2": { "throughput": 490 + }, + "gaudi3": { + "throughput": 490 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Salesforce/codegen2-1B-1-False-False]": { @@ -71,16 +105,25 @@ }, "gaudi2": { "throughput": 446.4029486883532 + }, + "gaudi3": { + "throughput": 446.4029486883532 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[THUDM/chatglm2-6b-1-True-False]": { "gaudi2": { "throughput": 150 + }, + "gaudi3": { + "throughput": 150 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[THUDM/chatglm3-6b-1-True-False]": { "gaudi2": { "throughput": 150 + }, + "gaudi3": { + "throughput": 150 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[adept/persimmon-8b-base-1-False-False]": { @@ -91,16 +134,25 @@ "tests/test_text_generation_example.py::test_text_generation_bf16_1x[adept/persimmon-8b-base-4-False-False]": { "gaudi2": { "throughput": 366.73968820698406 + }, + "gaudi3": { + "throughput": 366.73968820698406 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[baichuan-inc/Baichuan2-13B-Chat-1-False-False]": { "gaudi2": { "throughput": 66 + }, + "gaudi3": { + "throughput": 66 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[baichuan-inc/Baichuan2-7B-Chat-1-True-False]": { "gaudi2": { "throughput": 108 + }, + "gaudi3": { + "throughput": 108 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigcode/starcoder-1-False-False]": { @@ -112,6 +164,10 @@ "gaudi2": { "output": "def print_hello_world():\n print(\"Hello World\")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ", "throughput": 6846.575763562658 + }, + "gaudi3": { + "output": "def print_hello_world():\n print(\"Hello World\")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ", + "throughput": 6846.575763562658 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigcode/starcoder2-3b-1-False-False]": { @@ -123,6 +179,10 @@ "gaudi2": { "output": "def print_hello_world():\n print(\"Hello World\")\n\ndef print_hello_world_with_name(name):\n print(\"Hello World, \" + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print(\"Hello World, \" + name + \", \" + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print(\"Hello", "throughput": 261.07213776344133 + }, + "gaudi3": { + "output": "def print_hello_world():\n print(\"Hello World\")\n\ndef print_hello_world_with_name(name):\n print(\"Hello World, \" + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print(\"Hello World, \" + name + \", \" + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print(\"Hello", + "throughput": 261.07213776344133 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigscience/bloomz-7b1-1-False-False]": { @@ -131,33 +191,53 @@ }, "gaudi2": { "throughput": 130.0472971205316 + }, + "gaudi3": { + "throughput": 130.0472971205316 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[codellama/CodeLlama-34b-hf-1-True-False]": { "gaudi2": { "throughput": 32.644 + }, + "gaudi3": { + "throughput": 32.644 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[deepseek-ai/DeepSeek-V2-Lite-1-False-False]": { "gaudi2": { "throughput": 35 + }, + "gaudi3": { + "throughput": 35 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[facebook/xglm-1.7B-1-False-False]": { "gaudi2": { "throughput": 357.46365062825083 + }, + "gaudi3": { + "throughput": 357.46365062825083 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-27b-1-False-True]": { "gaudi2": { "output": "DeepSpeed is a machine learning framework that enables you to train models with trillions of parameters and beyond, using model parallelism to partition large models over multiple GPUs.\n\nThe following is a brief introduction to the DeepSpeed model parallel training.\n\n

1. Introduction

\n\nThe DeepSpeed model parallel training is a simple and effective way to train large models. It is a framework that enables you to train models with trillions of parameters and beyond.\n\nDeepSpeed is a distributed deep learning optimization toolkit that makes it easy and efficient", "throughput": 36.578709544111 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework that enables you to train models with trillions of parameters and beyond, using model parallelism to partition large models over multiple GPUs.\n\nThe following is a brief introduction to the DeepSpeed model parallel training.\n\n

1. Introduction

\n\nThe DeepSpeed model parallel training is a simple and effective way to train large models. It is a framework that enables you to train models with trillions of parameters and beyond.\n\nDeepSpeed is a distributed deep learning optimization toolkit that makes it easy and efficient", + "throughput": 36.578709544111 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-9b-1-False-True]": { "gaudi2": { "output": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and highly scalable, making it a powerful tool for researchers and practitioners working with large-scale deep learning models.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework, and provides a set of tools and libraries that make it easy to train large-scale models. It includes features such as zero-shot inference, which allows models to be", "throughput": 92.302359446567 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and highly scalable, making it a powerful tool for researchers and practitioners working with large-scale deep learning models.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework, and provides a set of tools and libraries that make it easy to train large-scale models. It includes features such as zero-shot inference, which allows models to be", + "throughput": 92.302359446567 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-7b-1-False-False]": { @@ -169,6 +249,10 @@ "gaudi2": { "output": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models. DeepSpeed is designed to be scalable, and it can be used to train models on a single machine or on a cluster of machines. DeepSpeed is designed to be efficient,", "throughput": 109.70751574382221 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models. DeepSpeed is designed to be scalable, and it can be used to train models on a single machine or on a cluster of machines. DeepSpeed is designed to be efficient,", + "throughput": 109.70751574382221 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[gpt2-xl-1-False-False]": { @@ -177,6 +261,9 @@ }, "gaudi2": { "throughput": 281.8734689674413 + }, + "gaudi3": { + "throughput": 281.8734689674413 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-1-True-False]": { @@ -188,21 +275,34 @@ "gaudi2": { "output": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex", "throughput": 141.25776956002076 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex", + "throughput": 141.25776956002076 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-512-False-False]": { "gaudi2": { "throughput": 8711 + }, + "gaudi3": { + "throughput": 8711 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-512-True-False]": { "gaudi2": { "throughput": 12808 + }, + "gaudi3": { + "throughput": 12808 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Meta-Llama-3-8B-1-True-False]": { "gaudi2": { "throughput": 129 + }, + "gaudi3": { + "throughput": 129 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[microsoft/phi-2-1-False-False]": { @@ -211,6 +311,9 @@ }, "gaudi2": { "throughput": 224.72307766211117 + }, + "gaudi3": { + "throughput": 224.72307766211117 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mistralai/Mistral-7B-v0.1-1-True-False]": { @@ -222,17 +325,28 @@ "gaudi2": { "output": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system", "throughput": 130.2172236767782 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system", + "throughput": 130.2172236767782 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mistralai/Mixtral-8x7B-v0.1-1-False-True]": { "gaudi2": { "output": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed", "throughput": 23.7931001677926 + }, + "gaudi3": { + "output": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed", + "throughput": 23.7931001677926 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mosaicml/mpt-30b-1-False-False]": { "gaudi2": { "throughput": 36.06464336116623 + }, + "gaudi3": { + "throughput": 36.06464336116623 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mosaicml/mpt-7b-1-False-False]": { @@ -243,6 +357,9 @@ "tests/test_text_generation_example.py::test_text_generation_bf16_1x[openbmb/MiniCPM3-4B-1-False-False]": { "gaudi2": { "throughput": 65.116 + }, + "gaudi3": { + "throughput": 65.116 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[stabilityai/stablelm-2-12b-1-False-False]": { @@ -251,11 +368,17 @@ }, "gaudi2": { "throughput": 74.8904496532218 + }, + "gaudi3": { + "throughput": 74.8904496532218 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[state-spaces/mamba-130m-hf-1536-False-False]": { "gaudi2": { "throughput": 5385.511100161605 + }, + "gaudi3": { + "throughput": 5385.511100161605 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[state-spaces/mamba-130m-hf-224-False-False]": { @@ -266,6 +389,9 @@ "tests/test_text_generation_example.py::test_text_generation_bf16_1x[tiiuae/falcon-40b-1-True-False]": { "gaudi2": { "throughput": 25.202450111088346 + }, + "gaudi3": { + "throughput": 25.202450111088346 } }, "tests/test_text_generation_example.py::test_text_generation_bf16_1x[tiiuae/falcon-7b-1-True-False]": { @@ -276,6 +402,9 @@ "tests/test_text_generation_example.py::test_text_generation_bf16_1x[tiiuae/falcon-mamba-7b-1-False-False]": { "gaudi2": { "throughput": 47.1464839567739 + }, + "gaudi3": { + "throughput": 47.1464839567739 } }, "tests/test_text_generation_example.py::test_text_generation_contrastive_search[gpt2-xl-1-False]": { @@ -284,11 +413,17 @@ }, "gaudi2": { "throughput": 51.61471298016438 + }, + "gaudi3": { + "throughput": 51.61471298016438 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[Qwen/Qwen2.5-72B-2-1]": { "gaudi2": { "throughput": 26 + }, + "gaudi3": { + "throughput": 26 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[bigscience/bloomz-7b1-8-1]": { @@ -299,146 +434,233 @@ "tests/test_text_generation_example.py::test_text_generation_deepspeed[bigscience/bloomz-8-1]": { "gaudi2": { "throughput": 36.77314954096159 + }, + "gaudi3": { + "throughput": 36.77314954096159 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[facebook/opt-66b-2-1]": { "gaudi2": { "throughput": 28.48069266504111 + }, + "gaudi3": { + "throughput": 28.48069266504111 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[google/gemma-2-27b-8-1]": { "gaudi2": { "throughput": 87.578709544111 + }, + "gaudi3": { + "throughput": 87.578709544111 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[google/gemma-2-9b-8-1]": { "gaudi2": { "throughput": 110.12610917383735 + }, + "gaudi3": { + "throughput": 110.12610917383735 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[meta-llama/Llama-2-70b-hf-8-1]": { "gaudi2": { "throughput": 64.10514998902435 + }, + "gaudi3": { + "throughput": 64.10514998902435 } }, "tests/test_text_generation_example.py::test_text_generation_deepspeed[meta-llama/Meta-Llama-3-70B-Instruct-8-1]": { "gaudi2": { "throughput": 64 + }, + "gaudi3": { + "throughput": 64 } }, "tests/test_text_generation_example.py::test_text_generation_distributed_tp[meta-llama/Llama-2-7b-hf]": { "gaudi2": { "throughput": 1345.2369318328463 + }, + "gaudi3": { + "throughput": 1345.2369318328463 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-4-207-False-2048-128]": { "gaudi2": { "throughput": 568.5 + }, + "gaudi3": { + "throughput": 568.5 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-4-3042-False-128-128]": { "gaudi2": { "throughput": 5374.6 + }, + "gaudi3": { + "throughput": 5374.6 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-4-750-False-128-2048]": { "gaudi2": { "throughput": 7422.4 + }, + "gaudi3": { + "throughput": 7422.4 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-8-172-False-2048-2048]": { "gaudi2": { "throughput": 4656.2 + }, + "gaudi3": { + "throughput": 4656.2 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-1230-False-128-128]": { "gaudi2": { "throughput": 13152.7 + }, + "gaudi3": { + "throughput": 13152.7 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-163-False-128-2048]": { "gaudi2": { "throughput": 4774.7 + }, + "gaudi3": { + "throughput": 4774.7 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-81-False-2048-2048]": { "gaudi2": { "throughput": 1942.9 + }, + "gaudi3": { + "throughput": 1942.9 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-94-False-2048-128]": { "gaudi2": { "throughput": 1293.3 + }, + "gaudi3": { + "throughput": 1293.3 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[microsoft/phi-2-1-1-True-128-128]": { "gaudi2": { "throughput": 254.08932787178165 + }, + "gaudi3": { + "throughput": 254.08932787178165 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-120-True-128-2048]": { "gaudi2": { "throughput": 6979.225194247115 + }, + "gaudi3": { + "throughput": 6979.225194247115 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-120-True-2048-128]": { "gaudi2": { "throughput": 1681.4401450088983 + }, + "gaudi3": { + "throughput": 1681.4401450088983 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-44-True-2048-2048]": { "gaudi2": { "throughput": 3393.149396451692 + }, + "gaudi3": { + "throughput": 3393.149396451692 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-896-True-128-128]": { "gaudi2": { "throughput": 17068.965283763682 + }, + "gaudi3": { + "throughput": 17068.965283763682 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-1-1-True-128-128]": { "gaudi2": { "throughput": 40.94 + }, + "gaudi3": { + "throughput": 40.94 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-48-True-2048-2048]": { "gaudi2": { "throughput": 1147.5 + }, + "gaudi3": { + "throughput": 1147.5 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-768-True-128-128]": { "gaudi2": { "throughput": 3428.65 + }, + "gaudi3": { + "throughput": 3428.65 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-96-True-128-2048]": { "gaudi2": { "throughput": 2570.34 + }, + "gaudi3": { + "throughput": 2570.34 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-96-True-2048-128]": { "gaudi2": { "throughput": 379.03 + }, + "gaudi3": { + "throughput": 379.03 } }, "tests/test_text_generation_example.py::test_text_generation_fp8[tiiuae/falcon-180B-4-950-True-128-128]": { "gaudi2": { "throughput": 2506.68 + }, + "gaudi3": { + "throughput": 2506.68 } }, "tests/test_text_generation_example.py::test_text_generation_gptq[TheBloke/Llama-2-7b-Chat-GPTQ-1-10-False-128-2048]": { "gaudi2": { "throughput": 456.7 + }, + "gaudi3": { + "throughput": 456.7 } }, "tests/test_text_generation_example.py::test_text_generation_torch_compile[meta-llama/Llama-2-7b-hf]": { "gaudi2": { "throughput": 102.27823420713148 + }, + "gaudi3": { + "throughput": 102.27823420713148 } }, "tests/test_text_generation_example.py::test_text_generation_torch_compile_distributed[meta-llama/Llama-2-7b-hf]": { "gaudi2": { "throughput": 39.72973199515235 + }, + "gaudi3": { + "throughput": 39.72973199515235 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_video_llava.json b/tests/baselines/fixture/tests/test_video_llava.json index a37db23bd6..90146af1f5 100644 --- a/tests/baselines/fixture/tests/test_video_llava.json +++ b/tests/baselines/fixture/tests/test_video_llava.json @@ -5,6 +5,9 @@ }, "gaudi2": { "throughput": 27.72902536827787 + }, + "gaudi3": { + "throughput": 27.72902536827787 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_video_mae.json b/tests/baselines/fixture/tests/test_video_mae.json index 8388c9ff80..481c431a19 100644 --- a/tests/baselines/fixture/tests/test_video_mae.json +++ b/tests/baselines/fixture/tests/test_video_mae.json @@ -5,6 +5,9 @@ }, "gaudi2": { "latency": 17.544198036193848 + }, + "gaudi3": { + "latency": 17.544198036193848 } } } \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_zero_shot_object_detection.json b/tests/baselines/fixture/tests/test_zero_shot_object_detection.json index a98d4e6556..ec3779fbc4 100644 --- a/tests/baselines/fixture/tests/test_zero_shot_object_detection.json +++ b/tests/baselines/fixture/tests/test_zero_shot_object_detection.json @@ -5,6 +5,9 @@ }, "gaudi2": { "latency": 4.213955687819833 + }, + "gaudi3": { + "latency": 4.213955687819833 } } } \ No newline at end of file diff --git a/tests/configs/examples/CodeLlama_13b_Instruct_hf.json b/tests/configs/examples/CodeLlama_13b_Instruct_hf.json index d2c2aa86f9..576171fb1e 100644 --- a/tests/configs/examples/CodeLlama_13b_Instruct_hf.json +++ b/tests/configs/examples/CodeLlama_13b_Instruct_hf.json @@ -5,9 +5,36 @@ "eval_batch_size": 48, "distribution": { "deepspeed": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 48, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--gradient_checkpointing", + "--use_hpu_graphs_for_inference", + "--deepspeed tests/configs/deepspeed_zero_1.json" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 1, + "eval_batch_size": 48, + "distribution": { + "deepspeed": { + "learning_rate": 5e-05, + "train_batch_size": 48, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", @@ -18,4 +45,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/LlamaGuard_7b.json b/tests/configs/examples/LlamaGuard_7b.json index 704fe64c73..7b0a4e122d 100644 --- a/tests/configs/examples/LlamaGuard_7b.json +++ b/tests/configs/examples/LlamaGuard_7b.json @@ -5,9 +5,36 @@ "eval_batch_size": 8, "distribution": { "deepspeed": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 128", + "--add_pad_token True", + "--use_hpu_graphs_for_inference", + "--deepspeed tests/configs/deepspeed_zero_2.json" + ] + } + } + } + }, + "gaudi3": { + "mrpc": { + "num_train_epochs": 3, + "eval_batch_size": 8, + "distribution": { + "deepspeed": { + "learning_rate": 3e-05, + "train_batch_size": 32, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--add_pad_token True", @@ -18,4 +45,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/Llama_3_1_8B.json b/tests/configs/examples/Llama_3_1_8B.json index 4c57db9a6b..3e9edaaeb1 100644 --- a/tests/configs/examples/Llama_3_1_8B.json +++ b/tests/configs/examples/Llama_3_1_8B.json @@ -5,9 +5,48 @@ "eval_batch_size": 1, "distribution": { "single_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 10, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_checkpointing", + "--eval_strategy epoch", + "--eval_delay 2", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type cosine", + "--logging_steps 1", + "--lora_rank 4", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--use_flash_attention True", + "--flash_attention_causal_mask True" + ] + } + } + } + }, + "gaudi3": { + "tatsu-lab/alpaca": { + "num_train_epochs": 2, + "eval_batch_size": 1, + "distribution": { + "single_card": { + "learning_rate": 0.0003, + "train_batch_size": 10, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_checkpointing", @@ -30,4 +69,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json b/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json index fd8abaccfc..b378a77213 100644 --- a/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json +++ b/tests/configs/examples/Llama_3_2_11B_Vision_Instruct.json @@ -5,9 +5,51 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 2, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 8", + "--eval_strategy no", + "--save_strategy no", + "--warmup_steps 50", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 8", + "--lora_dropout 0.1", + "--lora_target_modules '.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$'", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--input_column_name image query", + "--output_column_name answers", + "--remove_unused_columns False", + "--max_seq_length 512" + ] + } + } + } + }, + "gaudi3": { + "image2text_lora_finetune": { + "num_train_epochs": 1, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 5e-05, + "train_batch_size": 2, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 8", @@ -33,4 +75,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/Qwen2_72B.json b/tests/configs/examples/Qwen2_72B.json index 848bb0238d..9ac52560aa 100644 --- a/tests/configs/examples/Qwen2_72B.json +++ b/tests/configs/examples/Qwen2_72B.json @@ -5,9 +5,60 @@ "eval_batch_size": 8, "distribution": { "deepspeed": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16 True", + "--subset None", + "--streaming False", + "--packing False", + "--num_buckets 8", + "--gradient_accumulation_steps 8", + "--gradient_checkpointing True", + "--eval_strategy no", + "--save_strategy no", + "--throughput_warmup_steps 3", + "--learning_rate 3e-4", + "--warmup_ratio 0.03", + "--lr_scheduler_type cosine", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--adam_epsilon 1e-8", + "--use_peft True", + "--lora_r 4", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj k_proj o_proj", + "--max_seq_length 512", + "--weight_decay 0.05", + "--report_to none", + "--max_steps 10", + "--gradient_checkpointing True", + "--pipelining_fwd_bwd True", + "--deepspeed tests/configs/deepspeed_zero_3_gaudi1.json" + ] + } + } + } + }, + "gaudi3": { + "trl-sft-qwen": { + "num_train_epochs": 1, + "eval_batch_size": 8, + "distribution": { + "deepspeed": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16 True", "--subset None", @@ -42,4 +93,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/Qwen2_7B.json b/tests/configs/examples/Qwen2_7B.json index 23b4ea048a..56a74e084f 100644 --- a/tests/configs/examples/Qwen2_7B.json +++ b/tests/configs/examples/Qwen2_7B.json @@ -5,9 +5,12 @@ "eval_batch_size": 32, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 32, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16 True", "--subset ''", @@ -41,9 +44,88 @@ "eval_batch_size": 2, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 2, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16 True", + "--subset ''", + "--streaming False", + "--packing True", + "--gradient_accumulation_steps 8", + "--gradient_checkpointing True", + "--eval_strategy no", + "--save_strategy no", + "--throughput_warmup_steps 5", + "--warmup_ratio 0.03", + "--lr_scheduler_type cosine", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--adam_epsilon 3e-4", + "--use_peft False", + "--max_seq_length 4096", + "--report_to none", + "--use_flash_attention True", + "--max_steps 20" + ] + } + } + } + }, + "gaudi3": { + "trl-sft-chat-peft": { + "num_train_epochs": 1, + "eval_batch_size": 32, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 32, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16 True", + "--subset ''", + "--streaming False", + "--packing True", + "--gradient_accumulation_steps 8", + "--gradient_checkpointing True", + "--eval_strategy no", + "--save_strategy no", + "--throughput_warmup_steps 5", + "--warmup_ratio 0.03", + "--lr_scheduler_type cosine", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--adam_epsilon 3e-4", + "--use_peft True", + "--lora_r 4", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj k_proj o_proj", + "--max_seq_length 512", + "--weight_decay 0.05", + "--report_to none", + "--max_steps 20" + ] + } + } + }, + "trl-sft-chat": { + "num_train_epochs": 1, + "eval_batch_size": 2, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 2, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16 True", "--subset ''", @@ -69,4 +151,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/albert_large_v2.json b/tests/configs/examples/albert_large_v2.json index 59648caf3e..74dc607903 100644 --- a/tests/configs/examples/albert_large_v2.json +++ b/tests/configs/examples/albert_large_v2.json @@ -5,18 +5,26 @@ "eval_batch_size": 4, "distribution": { "single_card": { - "learning_rate": 6e-5, + "learning_rate": 6e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 6e-5, + "learning_rate": 6e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -31,18 +39,60 @@ "eval_batch_size": 4, "distribution": { "single_card": { - "learning_rate": 6e-5, + "learning_rate": 6e-05, "train_batch_size": 128, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 7e-5, + "learning_rate": 7e-05, "train_batch_size": 128, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + } + } + } + }, + "gaudi3": { + "squad": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "single_card": { + "learning_rate": 6e-05, + "train_batch_size": 128, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 7e-05, + "train_batch_size": 128, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -51,4 +101,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/albert_xxlarge_v1.json b/tests/configs/examples/albert_xxlarge_v1.json index ebda527c92..9b8e04472c 100644 --- a/tests/configs/examples/albert_xxlarge_v1.json +++ b/tests/configs/examples/albert_xxlarge_v1.json @@ -5,18 +5,26 @@ "eval_batch_size": 2, "distribution": { "single_card": { - "learning_rate": 1e-5, + "learning_rate": 1e-05, "train_batch_size": 12, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 12, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -31,18 +39,60 @@ "eval_batch_size": 2, "distribution": { "single_card": { - "learning_rate": 2e-5, + "learning_rate": 2e-05, "train_batch_size": 16, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 7e-5, + "learning_rate": 7e-05, "train_batch_size": 16, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + } + } + } + }, + "gaudi3": { + "squad": { + "num_train_epochs": 1, + "eval_batch_size": 2, + "distribution": { + "single_card": { + "learning_rate": 2e-05, + "train_batch_size": 16, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 7e-05, + "train_batch_size": 16, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -51,4 +101,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/ast_finetuned_speech_commands_v2.json b/tests/configs/examples/ast_finetuned_speech_commands_v2.json index b9c347222b..204d122a09 100644 --- a/tests/configs/examples/ast_finetuned_speech_commands_v2.json +++ b/tests/configs/examples/ast_finetuned_speech_commands_v2.json @@ -5,9 +5,46 @@ "eval_batch_size": 64, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 32, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second", "eval_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second", + "eval_samples_per_second" + ], + "extra_arguments": [ + "--audio_column_name audio", + "--label_column_name language", + "--remove_unused_columns False", + "--max_length_seconds 8", + "--attention_mask False", + "--warmup_ratio 0.1", + "--seed 0", + "--dataloader_num_workers 1", + "--ignore_mismatched_sizes=True", + "--use_hpu_graphs_for_training", + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" + ] + } + } + } + }, + "gaudi3": { + "common_language": { + "num_train_epochs": 10, + "eval_batch_size": 64, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 32, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second", + "eval_samples_per_second" + ], "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -26,4 +63,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/bert_base_uncased.json b/tests/configs/examples/bert_base_uncased.json index 1960b5272a..c92b010e02 100644 --- a/tests/configs/examples/bert_base_uncased.json +++ b/tests/configs/examples/bert_base_uncased.json @@ -4,18 +4,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 24, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 24, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -28,18 +36,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 6e-5, + "learning_rate": 6e-05, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -47,4 +63,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/bert_large_uncased_whole_word_masking.json b/tests/configs/examples/bert_large_uncased_whole_word_masking.json index f3a0d79692..e3ed43e39d 100755 --- a/tests/configs/examples/bert_large_uncased_whole_word_masking.json +++ b/tests/configs/examples/bert_large_uncased_whole_word_masking.json @@ -5,18 +5,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 24, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 7e-5, + "learning_rate": 7e-05, "train_batch_size": 24, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -29,18 +37,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 16, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -55,18 +71,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -79,18 +103,92 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 256, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 40, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 128", + "--use_hpu_graphs_for_inference" + ] + } + } + } + }, + "gaudi3": { + "squad": { + "num_train_epochs": 1, + "eval_batch_size": 8, + "distribution": { + "single_card": { + "learning_rate": 3e-05, + "train_batch_size": 32, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 3e-05, + "train_batch_size": 32, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + } + } + }, + "mrpc": { + "num_train_epochs": 3, + "eval_batch_size": 8, + "distribution": { + "single_card": { + "learning_rate": 3e-05, + "train_batch_size": 256, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 128", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 3e-05, + "train_batch_size": 40, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -99,4 +197,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/bloom_7b1.json b/tests/configs/examples/bloom_7b1.json index 9de0a72315..b017f499fe 100644 --- a/tests/configs/examples/bloom_7b1.json +++ b/tests/configs/examples/bloom_7b1.json @@ -5,9 +5,12 @@ "eval_batch_size": 4, "distribution": { "deepspeed": { - "learning_rate": 1e-4, + "learning_rate": 0.0001, "train_batch_size": 8, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_cache False", @@ -19,4 +22,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/bridgetower_large_itm_mlm_itc.json b/tests/configs/examples/bridgetower_large_itm_mlm_itc.json index 6dce3b79dc..52da14a07f 100644 --- a/tests/configs/examples/bridgetower_large_itm_mlm_itc.json +++ b/tests/configs/examples/bridgetower_large_itm_mlm_itc.json @@ -5,9 +5,40 @@ "eval_batch_size": 16, "distribution": { "multi_card": { - "learning_rate": 1e-5, + "learning_rate": 1e-05, "train_batch_size": 48, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name matching", + "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6", + "--image_column image", + "--caption_column image_description", + "--remove_unused_columns False", + "--mediapipe_dataloader", + "--dataloader_num_workers 2", + "--logging_steps 10", + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" + ] + } + } + } + }, + "gaudi3": { + "jmhessel/newyorker_caption_contest": { + "num_train_epochs": 5, + "eval_batch_size": 16, + "distribution": { + "multi_card": { + "learning_rate": 1e-05, + "train_batch_size": 48, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name matching", "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6", @@ -24,4 +55,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/chatglm3_6b.json b/tests/configs/examples/chatglm3_6b.json index ce55433e91..450e0eca41 100644 --- a/tests/configs/examples/chatglm3_6b.json +++ b/tests/configs/examples/chatglm3_6b.json @@ -5,9 +5,44 @@ "eval_batch_size": 4, "distribution": { "deepspeed": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_name wikitext", + "--dataset_config_name wikitext-2-raw-v1", + "--block_size 1024", + "--use_cache False", + "--gradient_checkpointing", + "--bf16", + "--eval_strategy no", + "--save_strategy no", + "--throughput_warmup_steps 3", + "--logging_first_step True", + "--logging_steps 20", + "--deepspeed tests/configs/deepspeed_zero_3_gaudi1.json" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "deepspeed": { + "learning_rate": 5e-05, + "train_batch_size": 4, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_name wikitext", "--dataset_config_name wikitext-2-raw-v1", @@ -26,4 +61,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/clip_roberta.json b/tests/configs/examples/clip_roberta.json index 37e9d1f5cc..87a28d40e3 100755 --- a/tests/configs/examples/clip_roberta.json +++ b/tests/configs/examples/clip_roberta.json @@ -5,9 +5,12 @@ "eval_batch_size": 64, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 64, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--data_dir $PWD/", "--dataset_config_name 2017", @@ -34,7 +37,42 @@ "multi_card": { "learning_rate": 5e-05, "train_batch_size": 512, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--data_dir $PWD/", + "--dataset_config_name 2017", + "--image_column image_path", + "--caption_column caption", + "--remove_unused_columns False", + "--warmup_steps 0", + "--weight_decay 0.1", + "--save_strategy no", + "--use_hpu_graphs", + "--dataloader_num_workers 2", + "--mediapipe_dataloader", + "--logging_nan_inf_filter", + "--trust_remote_code True", + "--max_steps 100" + ] + } + } + } + }, + "gaudi3": { + "ydshieh/coco_dataset_script": { + "eval_batch_size": 64, + "num_train_epochs": 1, + "distribution": { + "multi_card": { + "learning_rate": 5e-05, + "train_batch_size": 512, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--data_dir $PWD/", "--dataset_config_name 2017", @@ -55,4 +93,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/distilbert_base_uncased.json b/tests/configs/examples/distilbert_base_uncased.json index 0eb215102a..fb1900c312 100644 --- a/tests/configs/examples/distilbert_base_uncased.json +++ b/tests/configs/examples/distilbert_base_uncased.json @@ -5,18 +5,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 1e-4, + "learning_rate": 0.0001, "train_batch_size": 48, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 4e-4, + "learning_rate": 0.0004, "train_batch_size": 48, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -31,18 +39,60 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + } + } + } + }, + "gaudi3": { + "squad": { + "num_train_epochs": 2, + "eval_batch_size": 8, + "distribution": { + "single_card": { + "learning_rate": 0.0002, + "train_batch_size": 64, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 64, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -51,4 +101,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/falcon_40b.json b/tests/configs/examples/falcon_40b.json index 73c0ef93be..f499aec61c 100644 --- a/tests/configs/examples/falcon_40b.json +++ b/tests/configs/examples/falcon_40b.json @@ -5,9 +5,13 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 4e-4, + "learning_rate": 0.0004, "train_batch_size": 1, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 16", @@ -38,9 +42,13 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 4e-4, + "learning_rate": 0.0004, "train_batch_size": 1, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 16", @@ -64,6 +72,81 @@ ] } } - } + } + }, + "gaudi3": { + "timdettmers/openassistant-guanaco": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0004, + "train_batch_size": 1, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 16", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 64", + "--lora_alpha 16", + "--lora_dropout 0.1", + "--lora_target_modules query_key_value dense dense_h_to_4h dense_4h_to_h", + "--dataset_concatenation", + "--max_seq_length 256", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--pipelining_fwd_bwd", + "--validation_split_percentage 10" + ] + } + } + }, + "mamamiya405/finred": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0004, + "train_batch_size": 1, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 16", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 64", + "--lora_alpha 16", + "--lora_dropout 0.1", + "--lora_target_modules query_key_value dense dense_h_to_4h dense_4h_to_h", + "--max_seq_length 256", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--pipelining_fwd_bwd", + "--validation_split_percentage 10" + ] + } + } + } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/flan_t5_xxl.json b/tests/configs/examples/flan_t5_xxl.json index 3f67ea03b3..f16d13c882 100644 --- a/tests/configs/examples/flan_t5_xxl.json +++ b/tests/configs/examples/flan_t5_xxl.json @@ -5,9 +5,43 @@ "eval_batch_size": 22, "distribution": { "deepspeed": { - "learning_rate": 1e-4, + "learning_rate": 0.0001, "train_batch_size": 22, - "metrics": ["eval_rougeLsum", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_rougeLsum", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_steps 20", + "--max_eval_samples 880", + "--dataset_config 3.0.0", + "--source_prefix summarize: ", + "--predict_with_generate", + "--ignore_pad_token_for_loss False", + "--pad_to_max_length", + "--generation_max_length 129", + "--gradient_checkpointing", + "--adam_epsilon 1e-08", + "--deepspeed examples/summarization/ds_flan_t5_z3_config_bf16.json" + ] + } + } + } + }, + "gaudi3": { + "cnn_dailymail": { + "num_train_epochs": 2, + "eval_batch_size": 22, + "distribution": { + "deepspeed": { + "learning_rate": 0.0001, + "train_batch_size": 22, + "metrics": [ + "eval_rougeLsum", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_steps 20", "--max_eval_samples 880", @@ -25,4 +59,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/gemma_2b_it.json b/tests/configs/examples/gemma_2b_it.json index cfea562791..6ecab478ad 100644 --- a/tests/configs/examples/gemma_2b_it.json +++ b/tests/configs/examples/gemma_2b_it.json @@ -5,27 +5,87 @@ "eval_batch_size": 4, "distribution": { "single_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 8e-4, + "learning_rate": 0.0008, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" ] }, "deepspeed": { - "learning_rate": 8e-4, + "learning_rate": 0.0008, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference", + "--deepspeed tests/configs/deepspeed_zero_2.json" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "single_card": { + "learning_rate": 0.0002, + "train_batch_size": 4, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 0.0008, + "train_batch_size": 4, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference" + ] + }, + "deepspeed": { + "learning_rate": 0.0008, + "train_batch_size": 4, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -35,4 +95,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/gemma_2b_it_eager.json b/tests/configs/examples/gemma_2b_it_eager.json index 09808d99d5..ea7993094d 100644 --- a/tests/configs/examples/gemma_2b_it_eager.json +++ b/tests/configs/examples/gemma_2b_it_eager.json @@ -5,9 +5,33 @@ "eval_batch_size": 4, "distribution": { "single_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "single_card": { + "learning_rate": 0.0002, + "train_batch_size": 4, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1" ] @@ -15,4 +39,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/gpt2.json b/tests/configs/examples/gpt2.json index 747ec83dd8..59d09b7264 100644 --- a/tests/configs/examples/gpt2.json +++ b/tests/configs/examples/gpt2.json @@ -5,9 +5,13 @@ "eval_batch_size": 4, "distribution": { "single_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -15,9 +19,13 @@ ] }, "multi_card": { - "learning_rate": 4e-4, + "learning_rate": 0.0004, "train_batch_size": 4, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -33,18 +41,60 @@ "eval_batch_size": 4, "distribution": { "single_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 16, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 8e-4, + "learning_rate": 0.0008, "train_batch_size": 16, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "single_card": { + "learning_rate": 0.0002, + "train_batch_size": 16, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 0.0008, + "train_batch_size": 16, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" @@ -53,4 +103,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/gpt2_xl.json b/tests/configs/examples/gpt2_xl.json index eb89da6d27..eeb9398b73 100644 --- a/tests/configs/examples/gpt2_xl.json +++ b/tests/configs/examples/gpt2_xl.json @@ -5,12 +5,16 @@ "eval_batch_size": 4, "distribution": { "deepspeed": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 2, "perplexity": 12.6744, "train_runtime": 366.8694, "train_samples_per_second": 16.464, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -26,9 +30,36 @@ "eval_batch_size": 4, "distribution": { "deepspeed": { - "learning_rate": 4e-4, + "learning_rate": 0.0004, "train_batch_size": 16, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--gradient_checkpointing", + "--use_hpu_graphs_for_inference", + "--deepspeed tests/configs/deepspeed_zero_2.json" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "deepspeed": { + "learning_rate": 0.0004, + "train_batch_size": 16, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", @@ -39,4 +70,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/gpt_neox_20b.json b/tests/configs/examples/gpt_neox_20b.json index 5a68691c16..0ed304101f 100644 --- a/tests/configs/examples/gpt_neox_20b.json +++ b/tests/configs/examples/gpt_neox_20b.json @@ -5,9 +5,36 @@ "eval_batch_size": 2, "distribution": { "deepspeed": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 2, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--gradient_checkpointing", + "--use_hpu_graphs_for_inference", + "--deepspeed tests/configs/deepspeed_zero_2.json" + ] + } + } + } + }, + "gaudi3": { + "wikitext": { + "num_train_epochs": 1, + "eval_batch_size": 2, + "distribution": { + "deepspeed": { + "learning_rate": 5e-05, + "train_batch_size": 2, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", @@ -18,4 +45,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/idefics2_8b.json b/tests/configs/examples/idefics2_8b.json index c74f37ecee..45a40adfd3 100644 --- a/tests/configs/examples/idefics2_8b.json +++ b/tests/configs/examples/idefics2_8b.json @@ -5,9 +5,51 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 2, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 8", + "--eval_strategy no", + "--save_strategy no", + "--warmup_steps 50", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 8", + "--lora_dropout 0.1", + "--lora_target_modules '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$'", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--input_column_name image query", + "--output_column_name answers", + "--remove_unused_columns False", + "--max_seq_length 512" + ] + } + } + } + }, + "gaudi3": { + "image2text_lora_finetune": { + "num_train_epochs": 2, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 5e-05, + "train_batch_size": 2, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 8", @@ -33,4 +75,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/llama_7b.json b/tests/configs/examples/llama_7b.json index 29c4a23e0a..d3d4a8ffeb 100644 --- a/tests/configs/examples/llama_7b.json +++ b/tests/configs/examples/llama_7b.json @@ -5,9 +5,13 @@ "eval_batch_size": 2, "distribution": { "single_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 2, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -39,9 +43,13 @@ "eval_batch_size": 2, "distribution": { "multi_card": { - "learning_rate": 1e-4, + "learning_rate": 0.0001, "train_batch_size": 2, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 4", @@ -62,9 +70,13 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 16, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -96,9 +108,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -129,9 +145,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -161,9 +181,13 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 2", @@ -198,9 +222,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -234,9 +262,12 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 1e-4, + "learning_rate": 0.0001, "train_batch_size": 4, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 2", @@ -264,9 +295,12 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 1, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--logging_steps 1", "--lora_r 8", @@ -294,9 +328,12 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 1, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--logging_steps 1", "--lora_r 8", @@ -319,9 +356,12 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 8, - "metrics": ["train_runtime", "train_samples_per_second"], + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--lora_r 8", "--lora_alpha 16", @@ -346,9 +386,13 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 1, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--num_virtual_tokens 8", "--max_seq_length 64", @@ -369,9 +413,13 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 1, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--num_virtual_tokens 8", "--max_seq_length 64", @@ -392,9 +440,13 @@ "eval_batch_size": 1, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 1, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--num_virtual_tokens 8", "--max_seq_length 64", @@ -415,9 +467,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 16, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -450,9 +506,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -481,9 +541,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -520,9 +584,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 1e-2, + "learning_rate": 0.01, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 1", @@ -551,9 +619,13 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -582,9 +654,631 @@ "eval_batch_size": 4, "distribution": { "deepspeed": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16 True", + "--gradient_accumulation_steps 4", + "--logging_steps 1", + "--validation_split_percentage 10", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 2048", + "--pipelining_fwd_bwd", + "--throughput_warmup_steps 3", + "--use_lazy_mode", + "--context_parallel_size 4", + "--deepspeed tests/configs/deepspeed_zero_1.json" + ] + } + } + } + }, + "gaudi3": { + "databricks/databricks-dolly-15k": { + "num_train_epochs": 1, + "eval_batch_size": 8, + "distribution": { + "single_card": { + "learning_rate": 0.0002, + "train_batch_size": 16, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 1", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.1", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--validation_split_percentage 20", + "--attn_softmax_bf16", + "--max_steps 100", + "--input_column_name context", + "--output_column_name response" + ] + } + } + }, + "tatsu-lab/alpaca": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16" + ] + } + } + }, + "mamamiya405/finred": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16" + ] + } + } + }, + "tatsu-lab/alpaca_fsdpcompile": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16 True", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--pipelining_fwd_bwd False", + "--fsdp auto_wrap", + "--torch_compile_backend hpu_backend", + "--torch_compile", + "--fsdp_config examples/language-modeling/fsdp_config.json" + ] + } + } + }, + "llama-adapter": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--adapter_layers 2", + "--adapter_len 4", + "--peft_type llama-adapter" + ] + } + } + }, + "trl-sft": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0001, + "train_batch_size": 4, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16 True", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--lora_r 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--max_seq_length 1024", + "--optim paged_adamw_32bit", + "--weight_decay 0.05", + "--report_to none", + "--max_steps 100" + ] + } + } + }, + "trl-dpo": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 1, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--logging_steps 1", + "--lora_r 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj k_proj out_proj fc_in fc_out wte", + "--max_length 1024", + "--max_prompt_length 512", + "--report_to none", + "--max_steps 100", + "--eval_steps 200", + "--lr_scheduler_type cosine", + "--warmup_steps 0", + "--weight_decay 0.05", + "--optimizer_type paged_adamw_32bit", + "--beta 0.1", + "--gradient_accumulation_steps 4", + "--sanity_check" + ] + } + } + }, + "trl-reward": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 1, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--logging_steps 1", + "--lora_r 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj k_proj out_proj fc_in fc_out wte", + "--max_length 1024", + "--eval_steps 200", + "--lr_scheduler_type cosine", + "--weight_decay 0.05", + "--gradient_accumulation_steps 4", + "--train_subset 500", + "--eval_subset 100" + ] + } + } + }, + "trl-ppo": { + "num_train_epochs": 1, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 8, + "metrics": [ + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--lora_r 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--reward_model_name HuggingFaceH4/tiny-random-LlamaForSequenceClassification", + "--lora_target_modules q_proj v_proj k_proj out_proj fc_in fc_out wte", + "--max_train_samples 1000", + "--use_habana", + "--ppo_epochs 1", + "--batched_gen True", + "--mini_batch_size 1", + "--output_max_length 128", + "--input_max_length 128", + "--learning_rate 1.4e-5", + "--early_stopping" + ] + } + } + }, + "prompt-tuning": { + "num_train_epochs": 20, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 1, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--num_virtual_tokens 8", + "--max_seq_length 64", + "--logging_steps 1", + "--report_to none", + "--max_steps 100", + "--peft_type prompt_tuning", + "--lr_scheduler_type cosine", + "--warmup_steps 0", + "--weight_decay 0.05", + "--gradient_accumulation_steps 1" + ] + } + } + }, + "prefix-tuning": { + "num_train_epochs": 20, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 1, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--num_virtual_tokens 8", + "--max_seq_length 64", + "--logging_steps 1", + "--report_to none", + "--max_steps 100", + "--peft_type prefix_tuning", + "--lr_scheduler_type cosine", + "--warmup_steps 0", + "--weight_decay 0.05", + "--gradient_accumulation_steps 1" + ] + } + } + }, + "p-tuning": { + "num_train_epochs": 20, + "eval_batch_size": 1, + "distribution": { + "multi_card": { + "learning_rate": 0.0005, + "train_batch_size": 1, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--num_virtual_tokens 8", + "--max_seq_length 64", + "--logging_steps 1", + "--report_to none", + "--max_steps 100", + "--peft_type p_tuning", + "--lr_scheduler_type cosine", + "--warmup_steps 0", + "--weight_decay 0.05", + "--gradient_accumulation_steps 1" + ] + } + } + }, + "tatsu-lab/alpaca_fp8": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 16, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 1", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--logging_steps 40", + "--lora_rank 8", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--pipelining_fwd_bwd", + "--throughput_warmup_steps 18", + "--use_lazy_mode", + "--max_grad_norm 0.3", + "--fp8" + ] + } + } + }, + "ia3": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--ia3_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--peft_type ia3" + ] + } + } + }, + "adalora": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_alpha 16", + "--lora_dropout 0.05", + "--lora_target_modules q_proj v_proj", + "--adalora_init_r 12", + "--adalora_target_r 4", + "--adalora_tinit 50", + "--adalora_tfinal 500", + "--adalora_delta_t 100", + "--adalora_orth_reg_weight 0.5", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--peft_type adalora" + ] + } + } + }, + "vera": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.01, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 1", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--vera_target_modules q_proj v_proj", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--peft_type vera" + ] + } + } + }, + "ln_tuning": { + "num_train_epochs": 3, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 2", + "--eval_strategy no", + "--save_strategy no", + "--warmup_ratio 0.03", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--ln_target_module input_layernorm post_attention_layernorm norm", + "--dataset_concatenation", + "--max_seq_length 512", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--ddp_bucket_cap_mb 50", + "--validation_split_percentage 10", + "--attn_softmax_bf16", + "--peft_type ln_tuning" + ] + } + } + }, + "tatsu-lab/alpaca_cp": { + "num_train_epochs": 1, + "eval_batch_size": 4, + "distribution": { + "deepspeed": { + "learning_rate": 0.0003, + "train_batch_size": 8, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 4", @@ -606,4 +1300,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/llava_1_5_7b_hf.json b/tests/configs/examples/llava_1_5_7b_hf.json index 774ca979e0..b378a77213 100644 --- a/tests/configs/examples/llava_1_5_7b_hf.json +++ b/tests/configs/examples/llava_1_5_7b_hf.json @@ -5,9 +5,51 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 2, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--bf16", + "--gradient_accumulation_steps 8", + "--eval_strategy no", + "--save_strategy no", + "--warmup_steps 50", + "--lr_scheduler_type constant", + "--max_grad_norm 0.3", + "--logging_steps 1", + "--use_hpu_graphs_for_inference", + "--lora_rank 8", + "--lora_alpha 8", + "--lora_dropout 0.1", + "--lora_target_modules '.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$'", + "--low_cpu_mem_usage True", + "--adam_epsilon 1e-08", + "--input_column_name image query", + "--output_column_name answers", + "--remove_unused_columns False", + "--max_seq_length 512" + ] + } + } + } + }, + "gaudi3": { + "image2text_lora_finetune": { + "num_train_epochs": 1, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 5e-05, + "train_batch_size": 2, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 8", @@ -33,4 +75,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/protst_esm1b_for_sequential_classification.json b/tests/configs/examples/protst_esm1b_for_sequential_classification.json index d80c1dd57d..808354d887 100644 --- a/tests/configs/examples/protst_esm1b_for_sequential_classification.json +++ b/tests/configs/examples/protst_esm1b_for_sequential_classification.json @@ -5,9 +5,39 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 32, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--save_strategy no", + "--tokenizer_name facebook/esm1b_t33_650M_UR50S", + "--use_hpu_graphs_for_inference", + "--use_hpu_graphs_for_training", + "--trust_remote_code", + "--torch_dtype bfloat16", + "--label_names labels" + ] + } + } + } + }, + "gaudi3": { + "prost-sequence-classification": { + "num_train_epochs": 1, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 5e-05, + "train_batch_size": 32, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--save_strategy no", "--tokenizer_name facebook/esm1b_t33_650M_UR50S", @@ -21,4 +51,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/roberta_base.json b/tests/configs/examples/roberta_base.json index 8409805d8c..ac8477c654 100644 --- a/tests/configs/examples/roberta_base.json +++ b/tests/configs/examples/roberta_base.json @@ -5,18 +5,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 12, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 8e-5, + "learning_rate": 8e-05, "train_batch_size": 12, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -29,9 +37,13 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 24, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -47,18 +59,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 7e-5, + "learning_rate": 7e-05, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -71,9 +91,67 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 8e-5, + "learning_rate": 8e-05, "train_batch_size": 32, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference", + "--ddp_find_unused_parameters True" + ] + } + } + } + }, + "gaudi3": { + "squad": { + "num_train_epochs": 1, + "eval_batch_size": 8, + "distribution": { + "single_card": { + "learning_rate": 7e-05, + "train_batch_size": 64, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 0.0002, + "train_batch_size": 64, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + } + } + }, + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 8, + "distribution": { + "multi_card": { + "learning_rate": 8e-05, + "train_batch_size": 32, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -83,4 +161,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/roberta_large.json b/tests/configs/examples/roberta_large.json index 90b6dd5dce..72b7989a13 100755 --- a/tests/configs/examples/roberta_large.json +++ b/tests/configs/examples/roberta_large.json @@ -5,18 +5,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 12, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 8e-5, + "learning_rate": 8e-05, "train_batch_size": 12, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -29,9 +37,13 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 8, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -47,18 +59,26 @@ "eval_batch_size": 8, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 7e-5, + "learning_rate": 7e-05, "train_batch_size": 32, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -71,9 +91,67 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 7e-5, + "learning_rate": 7e-05, "train_batch_size": 16, - "metrics": ["perplexity", "train_runtime", "train_samples_per_second"], + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name wikitext-2-raw-v1", + "--use_hpu_graphs_for_inference", + "--ddp_find_unused_parameters True" + ] + } + } + } + }, + "gaudi3": { + "squad": { + "num_train_epochs": 1, + "eval_batch_size": 8, + "distribution": { + "single_card": { + "learning_rate": 3e-05, + "train_batch_size": 32, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + }, + "multi_card": { + "learning_rate": 7e-05, + "train_batch_size": 32, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--max_seq_length 384", + "--use_hpu_graphs_for_inference" + ] + } + } + }, + "wikitext": { + "num_train_epochs": 2, + "eval_batch_size": 8, + "distribution": { + "multi_card": { + "learning_rate": 7e-05, + "train_batch_size": 16, + "metrics": [ + "perplexity", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -83,4 +161,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/swin_base_patch4_window7_224_in22k.json b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json index 3f6a6c8693..e02bfe6610 100644 --- a/tests/configs/examples/swin_base_patch4_window7_224_in22k.json +++ b/tests/configs/examples/swin_base_patch4_window7_224_in22k.json @@ -5,9 +5,13 @@ "eval_batch_size": 64, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 64, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -20,9 +24,13 @@ ] }, "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 64, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -43,9 +51,13 @@ "eval_batch_size": 64, "distribution": { "single_card": { - "learning_rate": 6e-5, + "learning_rate": 6e-05, "train_batch_size": 160, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -58,9 +70,59 @@ ] }, "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 160, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--remove_unused_columns False", + "--image_column_name img", + "--seed 1337", + "--use_hpu_graphs_for_inference", + "--ignore_mismatched_sizes", + "--dataloader_num_workers 1", + "--pipelining_fwd_bwd True", + "--non_blocking_data_copy True" + ] + } + } + } + }, + "gaudi3": { + "cifar10": { + "num_train_epochs": 1, + "eval_batch_size": 64, + "distribution": { + "single_card": { + "learning_rate": 6e-05, + "train_batch_size": 160, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--remove_unused_columns False", + "--image_column_name img", + "--seed 1337", + "--use_hpu_graphs_for_inference", + "--ignore_mismatched_sizes", + "--dataloader_num_workers 1", + "--pipelining_fwd_bwd True", + "--non_blocking_data_copy True" + ] + }, + "multi_card": { + "learning_rate": 0.0002, + "train_batch_size": 160, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -75,4 +137,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/t5_small.json b/tests/configs/examples/t5_small.json index 38b1b4f11f..d42f257737 100644 --- a/tests/configs/examples/t5_small.json +++ b/tests/configs/examples/t5_small.json @@ -5,9 +5,14 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 4, - "metrics": ["eval_rougeLsum", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_rougeLsum", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -25,9 +30,13 @@ "eval_batch_size": 33, "distribution": { "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 16, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--context_column context", "--question_column question", @@ -50,9 +59,14 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 32, - "metrics": ["eval_rougeLsum", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_rougeLsum", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -70,9 +84,13 @@ "eval_batch_size": 33, "distribution": { "multi_card": { - "learning_rate": 2e-3, + "learning_rate": 0.002, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--context_column context", "--question_column question", @@ -93,9 +111,13 @@ "eval_batch_size": 33, "distribution": { "multi_card": { - "learning_rate": 2e-3, + "learning_rate": 0.002, "train_batch_size": 64, - "metrics": ["eval_f1", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--use_hpu_graphs_for_inference", "--use_hpu_graphs_for_training", @@ -112,9 +134,115 @@ "eval_batch_size": 4, "distribution": { "multi_card": { - "learning_rate": 2e-3, + "learning_rate": 0.002, "train_batch_size": 8, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--use_hpu_graphs_for_inference", + "--use_hpu_graphs_for_training", + "--max_source_length 256", + "--max_target_length 2", + "--max_train_samples 1000", + "--max_eval_samples 100", + "--bf16", + "--trust_remote_code True" + ] + } + } + } + }, + "gaudi3": { + "cnn_dailymail": { + "num_train_epochs": 1, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.0002, + "train_batch_size": 32, + "metrics": [ + "eval_rougeLsum", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config \"3.0.0\"", + "--source_prefix \"summarize: \"", + "--predict_with_generate", + "--ignore_pad_token_for_loss False", + "--pad_to_max_length", + "--use_hpu_graphs_for_inference", + "--save_strategy epoch" + ] + } + } + }, + "squad_v2": { + "num_train_epochs": 2, + "eval_batch_size": 33, + "distribution": { + "multi_card": { + "learning_rate": 0.002, + "train_batch_size": 64, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--context_column context", + "--question_column question", + "--answer_column answers", + "--version_2_with_negative", + "--max_seq_length 384", + "--predict_with_generate", + "--ignore_pad_token_for_loss False", + "--pad_to_max_length", + "--use_hpu_graphs_for_inference", + "--save_strategy epoch" + ] + } + } + }, + "multitask-prompt-tuning": { + "num_train_epochs": 1, + "eval_batch_size": 33, + "distribution": { + "multi_card": { + "learning_rate": 0.002, + "train_batch_size": 64, + "metrics": [ + "eval_f1", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--use_hpu_graphs_for_inference", + "--use_hpu_graphs_for_training", + "--max_source_length 256", + "--max_target_length 16", + "--bf16", + "--trust_remote_code True" + ] + } + } + }, + "poly-tuning": { + "num_train_epochs": 1, + "eval_batch_size": 4, + "distribution": { + "multi_card": { + "learning_rate": 0.002, + "train_batch_size": 8, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--use_hpu_graphs_for_inference", "--use_hpu_graphs_for_training", @@ -129,4 +257,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/vit_base_patch16_224_in21k.json b/tests/configs/examples/vit_base_patch16_224_in21k.json index 1071455031..bf9b6b297f 100644 --- a/tests/configs/examples/vit_base_patch16_224_in21k.json +++ b/tests/configs/examples/vit_base_patch16_224_in21k.json @@ -5,9 +5,13 @@ "eval_batch_size": 64, "distribution": { "single_card": { - "learning_rate": 5e-5, + "learning_rate": 5e-05, "train_batch_size": 64, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -19,9 +23,13 @@ ] }, "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 64, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -42,9 +50,13 @@ "eval_batch_size": 64, "distribution": { "single_card": { - "learning_rate": 3e-5, + "learning_rate": 3e-05, "train_batch_size": 128, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -56,9 +68,58 @@ ] }, "multi_card": { - "learning_rate": 2e-4, + "learning_rate": 0.0002, "train_batch_size": 128, - "metrics": ["eval_accuracy", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--remove_unused_columns False", + "--image_column_name img", + "--seed 1337", + "--use_hpu_graphs_for_inference", + "--dataloader_num_workers 1", + "--pipelining_fwd_bwd True", + "--non_blocking_data_copy True", + "--throughput_warmup_steps 8" + ] + } + } + } + }, + "gaudi3": { + "cifar10": { + "num_train_epochs": 1, + "eval_batch_size": 64, + "distribution": { + "single_card": { + "learning_rate": 3e-05, + "train_batch_size": 128, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--remove_unused_columns False", + "--image_column_name img", + "--seed 1337", + "--use_hpu_graphs_for_inference", + "--dataloader_num_workers 1", + "--pipelining_fwd_bwd True", + "--non_blocking_data_copy True" + ] + }, + "multi_card": { + "learning_rate": 0.0002, + "train_batch_size": 128, + "metrics": [ + "eval_accuracy", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -73,4 +134,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/wav2vec2_base.json b/tests/configs/examples/wav2vec2_base.json index 3b8d8e2b70..b56f9ab86a 100644 --- a/tests/configs/examples/wav2vec2_base.json +++ b/tests/configs/examples/wav2vec2_base.json @@ -5,9 +5,14 @@ "eval_batch_size": 64, "distribution": { "multi_card": { - "learning_rate": 5e-4, + "learning_rate": 0.0005, "train_batch_size": 32, - "metrics": ["eval_accuracy", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -31,9 +36,45 @@ "eval_batch_size": 64, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 0.0003, "train_batch_size": 32, - "metrics": ["eval_accuracy", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_accuracy", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--audio_column_name audio", + "--label_column_name language", + "--remove_unused_columns False", + "--max_length_seconds 8", + "--attention_mask False", + "--warmup_ratio 0.1", + "--seed 0", + "--dataloader_num_workers 1", + "--use_hpu_graphs_for_training", + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" + ] + } + } + } + }, + "gaudi3": { + "common_language": { + "num_train_epochs": 5, + "eval_batch_size": 64, + "distribution": { + "multi_card": { + "learning_rate": 0.0003, + "train_batch_size": 32, + "metrics": [ + "eval_accuracy", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -51,4 +92,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/wav2vec2_large_lv60.json b/tests/configs/examples/wav2vec2_large_lv60.json index 3ac83a4638..648d299a4b 100644 --- a/tests/configs/examples/wav2vec2_large_lv60.json +++ b/tests/configs/examples/wav2vec2_large_lv60.json @@ -5,9 +5,14 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 6e-4, + "learning_rate": 0.0006, "train_batch_size": 8, - "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_wer", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name clean", "--train_split_name train.100", @@ -18,7 +23,7 @@ "--layerdrop 0.0", "--freeze_feature_encoder", "--dataloader_num_workers 8", - "--chars_to_ignore ',?.!-;:\"“%‘”'", + "--chars_to_ignore ',?.!-;:\"\u201c%\u2018\u201d'", "--trust_remote_code True" ] } @@ -31,9 +36,14 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 4e-4, + "learning_rate": 0.0004, "train_batch_size": 8, - "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_wer", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name clean", "--train_split_name train.100", @@ -44,7 +54,40 @@ "--layerdrop 0.0", "--freeze_feature_encoder", "--dataloader_num_workers 8", - "--chars_to_ignore ',?.!-;:\"“%‘”'", + "--chars_to_ignore ',?.!-;:\"\u201c%\u2018\u201d'", + "--use_hpu_graphs_for_training", + "--use_hpu_graphs_for_inference", + "--trust_remote_code True" + ] + } + } + } + }, + "gaudi3": { + "regisss/librispeech_asr_for_optimum_habana_ci": { + "num_train_epochs": 2, + "eval_batch_size": 8, + "distribution": { + "multi_card": { + "learning_rate": 0.0004, + "train_batch_size": 8, + "metrics": [ + "eval_wer", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name clean", + "--train_split_name train.100", + "--eval_split_name validation", + "--preprocessing_num_workers 1", + "--warmup_steps 500", + "--text_column_name text", + "--layerdrop 0.0", + "--freeze_feature_encoder", + "--dataloader_num_workers 8", + "--chars_to_ignore ',?.!-;:\"\u201c%\u2018\u201d'", "--use_hpu_graphs_for_training", "--use_hpu_graphs_for_inference", "--trust_remote_code True" @@ -53,4 +96,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/configs/examples/whisper_small.json b/tests/configs/examples/whisper_small.json index b971a404da..4893b261ab 100644 --- a/tests/configs/examples/whisper_small.json +++ b/tests/configs/examples/whisper_small.json @@ -5,9 +5,14 @@ "eval_batch_size": 2, "distribution": { "multi_card": { - "learning_rate": 1e-4, + "learning_rate": 0.0001, "train_batch_size": 8, - "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_wer", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name hi", "--language hindi", @@ -36,9 +41,49 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 8e-5, + "learning_rate": 8e-05, "train_batch_size": 32, - "metrics": ["eval_wer", "eval_samples_per_second", "train_runtime", "train_samples_per_second"], + "metrics": [ + "eval_wer", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], + "extra_arguments": [ + "--dataset_config_name hi", + "--language hindi", + "--task transcribe", + "--train_split_name train+validation", + "--eval_split_name test", + "--preprocessing_num_workers 1", + "--generation_max_length 225", + "--max_duration_in_seconds 30", + "--text_column_name sentence", + "--freeze_feature_encoder False", + "--dataloader_num_workers 8", + "--predict_with_generate", + "--use_hpu_graphs_for_inference", + "--label_features_max_length 128", + "--trust_remote_code True" + ] + } + } + } + }, + "gaudi3": { + "mozilla-foundation/common_voice_11_0": { + "num_train_epochs": 10, + "eval_batch_size": 8, + "distribution": { + "multi_card": { + "learning_rate": 8e-05, + "train_batch_size": 32, + "metrics": [ + "eval_wer", + "eval_samples_per_second", + "train_runtime", + "train_samples_per_second" + ], "extra_arguments": [ "--dataset_config_name hi", "--language hindi", @@ -60,4 +105,4 @@ } } } -} +} \ No newline at end of file