From 777265a185391354b49ddba7877de7a8f4f8b86f Mon Sep 17 00:00:00 2001
From: Kaiyu Yang
Date: Tue, 2 Jul 2024 18:27:20 +0000
Subject: [PATCH] add torchtune

---
 torchtune/confs/llama3-8B_full.yaml | 77 +++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 torchtune/confs/llama3-8B_full.yaml

diff --git a/torchtune/confs/llama3-8B_full.yaml b/torchtune/confs/llama3-8B_full.yaml
new file mode 100644
index 0000000..d2d060d
--- /dev/null
+++ b/torchtune/confs/llama3-8B_full.yaml
@@ -0,0 +1,77 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama3 8B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+#
+# To launch on 4 devices, run the following command from root:
+#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full
+#
+# You can add specific overrides through the command line. For example,
+# to override the checkpointer directory while launching training,
+# you can run:
+#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# Single device full finetuning requires more memory optimizations. It's
+# best to use 8B_full_single_device.yaml for those cases.
+
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: /tmp/Meta-Llama-3-8B/original/tokenizer.model
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  train_on_input: True
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3.llama3_8b
+
+checkpointer:
+  _component_: torchtune.utils.FullModelMetaCheckpointer
+  checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
+  checkpoint_files: [
+    consolidated.00.pth
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Meta-Llama-3-8B/
+  model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 2
+epochs: 3
+
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 2e-5
+  foreach: False
+
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+
+
+# Training env
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/alpaca-llama3-finetune
+log_every_n_steps: null
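
As context for how the _component_ entries in the config above are consumed: the sketch below is a minimal illustration, assuming torchtune and omegaconf are installed and the /tmp artifacts referenced by the YAML (tokenizer.model, consolidated.00.pth) have already been fetched with tune download. It loads this file and instantiates each block roughly the way torchtune recipes do, via torchtune.config.instantiate; it is a sketch of what the recipe does internally, not part of this patch.

    # Sketch only: resolve the _component_ entries of the config added above.
    # The path is relative to the repo root where this patch creates the file.
    from omegaconf import OmegaConf
    from torchtune import config

    cfg = OmegaConf.load("torchtune/confs/llama3-8B_full.yaml")

    tokenizer = config.instantiate(cfg.tokenizer)                   # llama3_tokenizer; needs tokenizer.model on disk
    model = config.instantiate(cfg.model)                           # builds the full llama3_8b module (memory heavy)
    dataset = config.instantiate(cfg.dataset, tokenizer=tokenizer)  # alpaca_dataset; downloads the Alpaca data from HF
    optimizer = config.instantiate(cfg.optimizer, model.parameters())  # AdamW with lr=2e-5, foreach=False
    loss_fn = config.instantiate(cfg.loss)                          # torch.nn.CrossEntropyLoss

An actual run launched with tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full performs these steps inside the recipe (plus distributed wrapping, checkpoint loading, and the training loop); the sketch only shows what each section of the YAML maps to.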