2 rokov pred · 26e877fd42
--- a/docs/multi_gpu.md
+++ b/docs/multi_gpu.md
@@ -115,35 +115,47 @@ torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --m
 
																 It lets us specify the training settings for everything from `model_name` to `dataset_name`, `batch_size` and so on. Below is the list of supported settings:
															
 
																 ```python
															
 
																-
															
 
																-model_name: str="PATH/to/LLAMA 2/7B"
															
 
																-enable_fsdp: bool= False
															
 
																-run_validation: bool=True
															
 
																-batch_size_training: int=4
															
 
																-gradient_accumulation_steps: int=1
															
 
																-num_epochs: int=3
															
 
																-num_workers_dataloader: int=2
															
 
																-lr: float=2e-4
															
 
																-weight_decay: float=0.0
															
 
																-gamma: float= 0.85
															
 
																-use_fp16: bool=False
															
 
																-mixed_precision: bool=True
															
 
																-val_batch_size: int=4
															
 
																-dataset = "samsum_dataset" # alpaca_dataset, grammar_dataset
															
 
																-peft_method: str = "lora" # None , llama_adapter, prefix
															
 
																-use_peft: bool=False
															
 
																-output_dir: str = "./ft-output"
															
 
																-freeze_layers: bool = False
															
 
																-num_freeze_layers: int = 1
															
 
																-quantization: bool = False
															
 
																-save_model: bool = False
															
 
																-dist_checkpoint_root_folder: str="model_checkpoints"
															
 
																-dist_checkpoint_folder: str="fine-tuned"
															
 
																-save_optimizer: bool=False
															
 
																-flop_counter: bool=False # Enable FLOPS counter to measure model throughput, can not be used with pytorch profiler at the same time.
															
 
																-flop_counter_start: int=3 # The step to start profiling, default is 3, which means after 3 steps of warm-up stage, the profiler will start to count FLOPS.
															
 
																-use_profiler: bool=False # Enable pytorch profiler, can not be used with FLOPS counter at the same time.
															
 
																-profiler_dir: str="PATH/to/save/profiler/results" # will be used if using profiler
															
 
																+    model_name: str="PATH/to/Model"
															
 
																+    tokenizer_name: str=None
															
 
																+    enable_fsdp: bool=False
															
 
																+    low_cpu_fsdp: bool=False
															
 
																+    run_validation: bool=True
															
 
																+    batch_size_training: int=4
															
 
																+    batching_strategy: str="packing" #alternative: padding
															
 
																+    context_length: int=4096
															
 
																+    gradient_accumulation_steps: int=1
															
 
																+    gradient_clipping: bool = False
															
 
																+    gradient_clipping_threshold: float = 1.0
															
 
																+    num_epochs: int=3
															
 
																+    max_train_step: int=0
															
 
																+    max_eval_step: int=0
															
 
																+    num_workers_dataloader: int=1
															
 
																+    lr: float=1e-4
															
 
																+    weight_decay: float=0.0
															
 
																+    gamma: float= 0.85
															
 
																+    seed: int=42
															
 
																+    use_fp16: bool=False
															
 
																+    mixed_precision: bool=True
															
 
																+    val_batch_size: int=1
															
 
																+    dataset = "samsum_dataset"
															
 
																+    peft_method: str = "lora" # None,llama_adapter, prefix
															
 
																+    use_peft: bool=False
															
 
																+    output_dir: str = "PATH/to/save/PEFT/model"
															
 
																+    freeze_layers: bool = False
															
 
																+    num_freeze_layers: int = 1
															
 
																+    quantization: bool = False
															
 
																+    one_gpu: bool = False
															
 
																+    save_model: bool = True
															
 
																+    dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP
															
 
																+    dist_checkpoint_folder: str="fine-tuned" # will be used if using FSDP
															
 
																+    save_optimizer: bool=False # will be used if using FSDP
															
 
																+    use_fast_kernels: bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
															
 
																+    use_wandb: bool = False # Enable wandb for experient tracking
															
 
																+    save_metrics: bool = False # saves training metrics to a json file for later plotting
															
 
																+    flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
															
 
																+    flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
															
 
																+    use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
															
 
																+    profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
															
 
																 ```
															
 
																 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
															
--- a/docs/single_gpu.md
+++ b/docs/single_gpu.md
@@ -71,36 +71,47 @@ python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization
 
																 It let us specify the training settings, everything from `model_name` to `dataset_name`, `batch_size` etc. can be set here. Below is the list of supported settings:
															
 
																 ```python
															
 
																-
															
 
																-model_name: str="PATH/to/LLAMA 2/7B"
															
 
																-enable_fsdp: bool= False
															
 
																-run_validation: bool=True
															
 
																-batch_size_training: int=4
															
 
																-gradient_accumulation_steps: int=1
															
 
																-num_epochs: int=3
															
 
																-num_workers_dataloader: int=2
															
 
																-lr: float=2e-4
															
 
																-weight_decay: float=0.0
															
 
																-gamma: float= 0.85
															
 
																-use_fp16: bool=False
															
 
																-mixed_precision: bool=True
															
 
																-val_batch_size: int=4
															
 
																-dataset = "samsum_dataset" # alpaca_dataset,grammar_dataset
															
 
																-peft_method: str = "lora" # None , llama_adapter, prefix
															
 
																-use_peft: bool=False
															
 
																-output_dir: str = "./ft-output"
															
 
																-freeze_layers: bool = False
															
 
																-num_freeze_layers: int = 1
															
 
																-quantization: bool = False
															
 
																-one_gpu: bool = False
															
 
																-save_model: bool = False
															
 
																-dist_checkpoint_root_folder: str="model_checkpoints"
															
 
																-dist_checkpoint_folder: str="fine-tuned"
															
 
																-save_optimizer: bool=False
															
 
																-flop_counter: bool=False # Enable FLOPS counter to measure model throughput, can not be used with pytorch profiler at the same time.
															
 
																-flop_counter_start: int=3 # The step to start profiling, default is 3, which means after 3 steps of warm-up stage, the profiler will start to count FLOPS.
															
 
																-use_profiler: bool=False # Enable pytorch profiler, can not be used with FLOPS counter at the same time.
															
 
																-profiler_dir: str="PATH/to/save/profiler/results" # will be used if using profiler
															
 
																+    model_name: str="PATH/to/Model"
															
 
																+    tokenizer_name: str=None
															
 
																+    enable_fsdp: bool=False
															
 
																+    low_cpu_fsdp: bool=False
															
 
																+    run_validation: bool=True
															
 
																+    batch_size_training: int=4
															
 
																+    batching_strategy: str="packing" #alternative: padding
															
 
																+    context_length: int=4096
															
 
																+    gradient_accumulation_steps: int=1
															
 
																+    gradient_clipping: bool = False
															
 
																+    gradient_clipping_threshold: float = 1.0
															
 
																+    num_epochs: int=3
															
 
																+    max_train_step: int=0
															
 
																+    max_eval_step: int=0
															
 
																+    num_workers_dataloader: int=1
															
 
																+    lr: float=1e-4
															
 
																+    weight_decay: float=0.0
															
 
																+    gamma: float= 0.85
															
 
																+    seed: int=42
															
 
																+    use_fp16: bool=False
															
 
																+    mixed_precision: bool=True
															
 
																+    val_batch_size: int=1
															
 
																+    dataset = "samsum_dataset"
															
 
																+    peft_method: str = "lora" # None,llama_adapter, prefix
															
 
																+    use_peft: bool=False
															
 
																+    output_dir: str = "PATH/to/save/PEFT/model"
															
 
																+    freeze_layers: bool = False
															
 
																+    num_freeze_layers: int = 1
															
 
																+    quantization: bool = False
															
 
																+    one_gpu: bool = False
															
 
																+    save_model: bool = True
															
 
																+    dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP
															
 
																+    dist_checkpoint_folder: str="fine-tuned" # will be used if using FSDP
															
 
																+    save_optimizer: bool=False # will be used if using FSDP
															
 
																+    use_fast_kernels: bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
															
 
																+    use_wandb: bool = False # Enable wandb for experient tracking
															
 
																+    save_metrics: bool = False # saves training metrics to a json file for later plotting
															
 
																+    flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
															
 
																+    flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
															
 
																+    use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
															
 
																+    profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
															
 
																 ```
															
 
																 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
															
--- a/recipes/finetuning/README.md
+++ b/recipes/finetuning/README.md
@@ -23,37 +23,47 @@ If you are new to fine-tuning techniques, check out an overview: [](./LLM_finetu
 
																 It lets us specify the training settings for everything from `model_name` to `dataset_name`, `batch_size` and so on. Below is the list of supported settings:
															
 
																 ```python
															
 
																-
															
 
																-model_name: str="PATH/to/LLAMA 2/7B"
															
 
																-enable_fsdp: bool=False
															
 
																-run_validation: bool=True
															
 
																-batch_size_training: int=4
															
 
																-gradient_accumulation_steps: int=1
															
 
																-max_train_step: int=0
															
 
																-max_eval_step: int=0
															
 
																-num_epochs: int=3
															
 
																-num_workers_dataloader: int=2
															
 
																-lr: float=2e-4
															
 
																-weight_decay: float=0.0
															
 
																-gamma: float=0.85
															
 
																-use_fp16: bool=False
															
 
																-mixed_precision: bool=True
															
 
																-val_batch_size: int=4
															
 
																-dataset = "samsum_dataset" # alpaca_dataset, grammar_dataset
															
 
																-peft_method: str="lora" # None , llama_adapter, prefix
															
 
																-use_peft: bool=False
															
 
																-output_dir: str="./ft-output"
															
 
																-freeze_layers: bool = False
															
 
																-num_freeze_layers: int = 1
															
 
																-quantization: bool = False
															
 
																-save_model: bool = False
															
 
																-dist_checkpoint_root_folder: str="model_checkpoints"
															
 
																-dist_checkpoint_folder: str="fine-tuned"
															
 
																-save_optimizer: bool=False
															
 
																-flop_counter: bool=False # Enable FLOPS counter to measure model throughput, can not be used with pytorch profiler at the same time.
															
 
																-flop_counter_start: int=3 # The step to start profiling, default is 3, which means after 3 steps of warm-up stage, the profiler will start to count FLOPS.
															
 
																-use_profiler: bool=False # Enable pytorch profiler, can not be used with FLOPS counter at the same time.
															
 
																-profiler_dir: str="PATH/to/save/profiler/results" # will be used if using profiler
															
 
																+    model_name: str="PATH/to/Model"
															
 
																+    tokenizer_name: str=None
															
 
																+    enable_fsdp: bool=False
															
 
																+    low_cpu_fsdp: bool=False
															
 
																+    run_validation: bool=True
															
 
																+    batch_size_training: int=4
															
 
																+    batching_strategy: str="packing" #alternative: padding
															
 
																+    context_length: int=4096
															
 
																+    gradient_accumulation_steps: int=1
															
 
																+    gradient_clipping: bool = False
															
 
																+    gradient_clipping_threshold: float = 1.0
															
 
																+    num_epochs: int=3
															
 
																+    max_train_step: int=0
															
 
																+    max_eval_step: int=0
															
 
																+    num_workers_dataloader: int=1
															
 
																+    lr: float=1e-4
															
 
																+    weight_decay: float=0.0
															
 
																+    gamma: float= 0.85
															
 
																+    seed: int=42
															
 
																+    use_fp16: bool=False
															
 
																+    mixed_precision: bool=True
															
 
																+    val_batch_size: int=1
															
 
																+    dataset = "samsum_dataset"
															
 
																+    peft_method: str = "lora" # None,llama_adapter, prefix
															
 
																+    use_peft: bool=False
															
 
																+    output_dir: str = "PATH/to/save/PEFT/model"
															
 
																+    freeze_layers: bool = False
															
 
																+    num_freeze_layers: int = 1
															
 
																+    quantization: bool = False
															
 
																+    one_gpu: bool = False
															
 
																+    save_model: bool = True
															
 
																+    dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP
															
 
																+    dist_checkpoint_folder: str="fine-tuned" # will be used if using FSDP
															
 
																+    save_optimizer: bool=False # will be used if using FSDP
															
 
																+    use_fast_kernels: bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
															
 
																+    use_wandb: bool = False # Enable wandb for experient tracking
															
 
																+    save_metrics: bool = False # saves training metrics to a json file for later plotting
															
 
																+    flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
															
 
																+    flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
															
 
																+    use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
															
 
																+    profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
															
 
																 ```
															
 
																 * [Datasets config file](../../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
															
--- a/src/llama_recipes/configs/training.py
+++ b/src/llama_recipes/configs/training.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 
																 @dataclass
															
 
																 class train_config:
															
 
																-    model_name: str="PATH/to/LLAMA/7B"
															
 
																+    model_name: str="PATH/to/Model"
															
 
																     tokenizer_name: str=None
															
 
																     enable_fsdp: bool=False
															
 
																     low_cpu_fsdp: bool=False
															
@@ -29,7 +29,7 @@ class train_config:
 
																     mixed_precision: bool=True
															
 
																     val_batch_size: int=1
															
 
																     dataset = "samsum_dataset"
															
 
																-    peft_method: str = "lora" # None , llama_adapter, prefix
															
 
																+    peft_method: str = "lora" # None,llama_adapter, prefix
															
 
																     use_peft: bool=False
															
 
																     output_dir: str = "PATH/to/save/PEFT/model"
															
 
																     freeze_layers: bool = False
															
--- a/src/llama_recipes/utils/flop_utils.py
+++ b/src/llama_recipes/utils/flop_utils.py
@@ -1,5 +1,5 @@
 
																 from typing import Any, Dict, List, Optional, Union
															
 
																-
															
 
																+import time
															
 
																 import torch
															
 
																 from torch.utils.flop_counter import FlopCounterMode
															
@@ -15,14 +15,12 @@ class FlopMeasure(FlopCounterMode):
 
																     .. code-block:: python
															
 
																-        mod = ...
															
 
																-        flop_counter = FlopMeasure(mod)
															
 
																+        model = ...
															
 
																+        flop_counter = FlopMeasure(model,local_rank=0,warmup_step=3)
															
 
																         for batch in enumerate(dataloader):
															
 
																             with flop_counter:
															
 
																-                if step == 3:
															
 
																-                    flop_counter.start_counting()
															
 
																-                mod(batch)
															
 
																-                flop_counter.stop_counting()
															
 
																+                model(batch)
															
 
																+                flop_counter.step()
															
 
																     """
															
 
																     def __init__(
															
@@ -32,50 +30,58 @@ class FlopMeasure(FlopCounterMode):
 
																         display: bool = True,
															
 
																         custom_mapping: Dict[Any, Any] = None,
															
 
																         rank=None,
															
 
																+        warmup_step: int = 3,
															
 
																     ):
															
 
																         super().__init__(mods, depth, display, custom_mapping)
															
 
																-        self.ready = False
															
 
																         self.rank = rank
															
 
																+        self.warmup_step = warmup_step
															
 
																+        self.start_time = 0
															
 
																+        self.end_time = 0
															
 
																+    def step(self):
															
 
																+        # decrease the warmup step by 1 for every step, so that the flop counting will start when warmup_step =0. Stop decreasing when warm_up reaches -1.
															
 
																+        if self.warmup_step >= 0:
															
 
																+            self.warmup_step -= 1
															
 
																+        if self.warmup_step == 0 and self.start_time == 0:
															
 
																+            self.start_time = time.time()
															
 
																+        elif self.warmup_step == -1 and self.start_time != 0 and self.end_time == 0:
															
 
																+            self.end_time = time.time()
															
 
																     def __enter__(self):
															
 
																-        self.ready = False
															
 
																+        if self.warmup_step == 0:
															
 
																+            self.start_time = time.time()
															
 
																         super().__enter__()
															
 
																         return self
															
 
																-
															
 
																+    def is_done(self):
															
 
																+        return self.warmup_step == -1
															
 
																     def get_total_flops(self):
															
 
																         return super().get_total_flops()
															
 
																-
															
 
																+    def get_flops_per_sec(self):
															
 
																+        if self.start_time == 0 or self.end_time == 0:
															
 
																+            print("Warning: flop count did not finish correctly")
															
 
																+            return 0
															
 
																+        return super().get_total_flops()/ (self.end_time - self.start_time)
															
 
																     def get_table(self, depth=2):
															
 
																         return super().get_table(depth)
															
 
																     def __exit__(self, *args):
															
 
																-        self.ready = False
															
 
																         if self.get_total_flops() == 0:
															
 
																             print(
															
 
																                 "Warning: did not record any flops this time. Skipping the flop report"
															
 
																             )
															
 
																         else:
															
 
																-            self.stop_counting()
															
 
																             if self.display:
															
 
																                 if self.rank is None or self.rank == 0:
															
 
																-                    print("self.flop_counts", self.get_total_flops())
															
 
																+                    print("Total time used in this flop counting step is: {}".format(self.end_time - self.start_time))
															
 
																+                    print("The total TFlop per second is: {}".format(self.get_flops_per_sec() / 1e12))
															
 
																+                    print("The tflop_count table is below:")
															
 
																                     print(self.get_table(self.depth))
															
 
																             # Disable the display feature so that we don't print the table again
															
 
																             self.display = False
															
 
																         super().__exit__(*args)
															
 
																-    def start_counting(self):
															
 
																-        self.ready = True
															
 
																-
															
 
																-    def is_ready(self):
															
 
																-        return self.ready
															
 
																-
															
 
																-    def stop_counting(self):
															
 
																-        self.ready = False
															
 
																-
															
 
																     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
															
 
																-        # return the original output if not ready
															
 
																-        if not self.ready:
															
 
																-            return func(*args, **kwargs)
															
 
																-        # otherwise, count the flops and return the original output
															
 
																-        return super().__torch_dispatch__(func, types, args, kwargs)
															
 
																+        # when warmup_step is 0, count the flops and return the original output
															
 
																+        if self.warmup_step == 0:
															
 
																+            return super().__torch_dispatch__(func, types, args, kwargs)
															
 
																+        # otherwise, just return the original output
															
 
																+        return func(*args, **kwargs)
															
--- a/src/llama_recipes/utils/train_utils.py
+++ b/src/llama_recipes/utils/train_utils.py
@@ -59,9 +59,9 @@ def profile(cfg, local_rank=None):
 
																         ) as torch_profiler:
															
 
																             yield torch_profiler
															
 
																     elif use_flop_counter:
															
 
																-        if cfg.max_train_step > 0 and cfg.max_train_step < cfg.flop_counter_start:
															
 
																-            raise ValueError(f"flop counter requires at least {cfg.flop_counter_start} train steps, please increase the max_train_step, current max_train_step {cfg.max_train_step}")
															
 
																-        with FlopMeasure(rank=local_rank) as flop_counter:
															
 
																+        if cfg.max_train_step > 0 and cfg.max_train_step <= cfg.flop_counter_start:
															
 
																+            raise ValueError(f"flop counter requires at least {cfg.flop_counter_start + 1} train steps, please increase the max_train_step, current max_train_step {cfg.max_train_step}")
															
 
																+        with FlopMeasure(rank=local_rank,warmup_step=cfg.flop_counter_start) as flop_counter:
															
 
																             yield flop_counter
															
 
																     else:
															
 
																         torch_profiler = contextlib.nullcontext()
															
@@ -135,9 +135,6 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
																                         if not train_config.enable_fsdp or local_rank==0:
															
 
																                             print("max training steps reached, stopping training, total train steps finished: ", total_train_steps-1)
															
 
																                         break
															
 
																-                    if train_config.flop_counter and total_train_steps == train_config.flop_counter_start:
															
 
																-                        print("start flop counting at the step: ", total_train_steps)
															
 
																-                        profile_context.start_counting()
															
 
																                     for key in batch.keys():
															
 
																                         if train_config.enable_fsdp:
															
 
																                             if is_xpu_available():
															
@@ -183,11 +180,10 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
																                             optimizer.step()
															
 
																                             optimizer.zero_grad()
															
 
																                             pbar.update(1)
															
 
																-                    if train_config.use_profiler:
															
 
																+                    if train_config.use_profiler or train_config.flop_counter:
															
 
																                         profile_context.step()
															
 
																-                    if train_config.flop_counter and profile_context.is_ready():
															
 
																-                        TFlops = profile_context.get_total_flops() / 1e12
															
 
																-                        profile_context.stop_counting()
															
 
																+                    if train_config.flop_counter and profile_context.is_done():
															
 
																+                        TFlops = profile_context.get_flops_per_sec() / 1e12
															
 
																                     if wandb_run:
															
 
																                         if not train_config.enable_fsdp or rank==0:
															
 
																                             wandb_run.log({