1 год назад · f6617fb86a
--- a/docs/multi_gpu.md
+++ b/docs/multi_gpu.md
@@ -24,7 +24,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
				 
			
 
				 ```bash
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 ```
			
 
				 
			
@@ -43,7 +43,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 
				 Setting `use_fast_kernels` will enable using of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This would speed up the fine-tuning job. This has been enabled in `optimum` library from HuggingFace as a one-liner API, please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
			
 
				 
			
 
				 ```bash
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
			
 
				 ```
			
 
				 
			
 
				 ### Fine-tuning using FSDP Only
			
@@ -52,7 +52,7 @@ If interested in running full parameter finetuning without making use of PEFT me
 
				 
			
 
				 ```bash
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
			
 
				+torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --use_fast_kernels
			
 
				 
			
 
				 ```
			
 
				 
			
@@ -62,7 +62,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
				 
			
 
				 ```bash
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
			
 
				+torchrun --nnodes 1 --nproc_per_node 8 recipes/quickstart/finetuning/finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
			
 
				 
			
 
				 ```
			
 
				 
			
@@ -72,7 +72,7 @@ Here we use a slurm script to schedule a job with slurm over multiple nodes.
 
				 
			
 
				 ```bash
			
 
				 
			
 
				-sbatch examples/multi_node.slurm
			
 
				+sbatch recipes/quickstart/finetuning/multi_node.slurm
			
 
				 # Change the num nodes and GPU per nodes in the script before running.
			
 
				 
			
 
				 ```
			
@@ -95,16 +95,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
				 
			
 
				 ```bash
			
 
				 # grammer_dataset
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 # alpaca_dataset
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 
			
 
				 # samsum_dataset
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 ```
			
 
				 
			
@@ -182,7 +182,7 @@ It lets us specify the training settings for everything from `model_name` to `da
 
				 
			
 
				 * `fsdp_activation_checkpointing` enables activation checkpoining for FSDP, this saves significant amount of memory with the trade off of recomputing itermediate activations during the backward pass. The saved memory can be re-invested in higher batch sizes to increase the throughput. We recommond you use this option.
			
 
				 
			
 
				-* `pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
			
 
				+* `fsdp_config.pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
			
 
				 
			
 
				 ## FLOPS Counting and Pytorch Profiling
			
 
				 
			
--- a/recipes/quickstart/finetuning/datasets/README.md
+++ b/recipes/quickstart/finetuning/datasets/README.md
@@ -32,18 +32,18 @@ To supply a custom dataset you need to provide a single .py file which contains
 
				 ```@python
			
 
				 def get_custom_dataset(dataset_config, tokenizer, split: str):
			
 
				 ```
			
 
				-For an example `get_custom_dataset` you can look at the provided datasets in llama_recipes.datasets or [examples/custom_dataset.py](custom_dataset.py).
			
 
				+For an example `get_custom_dataset` you can look at the provided datasets in llama_recipes.datasets or [custom_dataset.py](./custom_dataset.py).
			
 
				 The `dataset_config` in the above signature will be an instance of llama_recipes.configs.dataset.custom_dataset with the modifications made through the command line.
			
 
				 The split signals wether to return the training or validation dataset.
			
 
				 The default function name is `get_custom_dataset` but this can be changed as described below.
			
 
				 
			
 
				 In order to start a training with the custom dataset we need to set the `--dataset` as well as the `--custom_dataset.file` parameter.
			
 
				 ```
			
 
				-python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "examples/custom_dataset.py" [TRAINING PARAMETERS]
			
 
				+python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "custom_dataset.py" [TRAINING PARAMETERS]
			
 
				 ```
			
 
				 To change the function name that is used in the .py you can append the name following a `:` like this:
			
 
				 ```
			
 
				-python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "examples/custom_dataset.py:get_foo" [TRAINING PARAMETERS]
			
 
				+python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "custom_dataset.py:get_foo" [TRAINING PARAMETERS]
			
 
				 ```
			
 
				 This will call the function `get_foo` instead of `get_custom_dataset` when retrieving the dataset.
			
 
				 
			
--- a/recipes/quickstart/finetuning/multigpu_finetuning.md
+++ b/recipes/quickstart/finetuning/multigpu_finetuning.md
@@ -49,7 +49,7 @@ The args used in the command above are:
 
				 If interested in running full parameter finetuning without making use of PEFT methods, please use the following command. Make sure to change the `nproc_per_node` to your available GPUs. This has been tested with `BF16` on 8xA100, 40GB GPUs.
			
 
				 
			
 
				 ```bash
			
 
				-torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
			
 
				+torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --use_fast_kernels
			
 
				 ```
			
 
				 
			
 
				 ### Using less CPU memory (FSDP on 70B model)
			
@@ -57,7 +57,7 @@ torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name
 
				 If you are running full parameter fine-tuning on the 70B model, you can enable `low_cpu_fsdp` mode as the following command. This option will load model on rank0 only before moving model to devices to construct FSDP. This can dramatically save cpu memory when loading large models like 70B (on a 8-gpu node, this reduces cpu memory from 2+T to 280G for 70B model). This has been tested with `BF16` on 16xA100, 80GB GPUs.
			
 
				 
			
 
				 ```bash
			
 
				-torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
			
 
				+torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
			
 
				 ```
			
 
				 
			
 
				 
			
@@ -79,16 +79,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
				 
			
 
				 ```bash
			
 
				 # grammer_dataset
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 # alpaca_dataset
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 
			
 
				 # samsum_dataset
			
 
				 
			
 
				-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
			
 
				 
			
 
				 ```
			
 
				 
			
--- a/recipes/quickstart/inference/local_inference/README.md
+++ b/recipes/quickstart/inference/local_inference/README.md
@@ -69,7 +69,7 @@ In case you have fine-tuned your model with pure FSDP and saved the checkpoints
 
				 This is helpful if you have fine-tuned you model using FSDP only as follows:
			
 
				 
			
 
				 ```bash
			
 
				-torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
			
 
				+torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16
			
 
				 ```
			
 
				 Then convert your FSDP checkpoint to HuggingFace checkpoints using:
			
 
				 ```bash
			
--- a/src/llama_recipes/configs/datasets.py
+++ b/src/llama_recipes/configs/datasets.py
@@ -29,6 +29,6 @@ class alpaca_dataset:
 
				 @dataclass
			
 
				 class custom_dataset:
			
 
				     dataset: str = "custom_dataset"
			
 
				-    file: str = "examples/custom_dataset.py"
			
 
				+    file: str = "recipes/quickstart/finetuning/datasets/custom_dataset.py"
			
 
				     train_split: str = "train"
			
 
				-    test_split: str = "validation"
			
 
				+    test_split: str = "validation"