
Created using Colaboratory

Maxime Labonne, 1 year ago
commit 5726fea665
1 changed file with 96 additions and 58 deletions

Fine_tune_Llama_2_in_Google_Colab.ipynb  (+96, -58)

@@ -6,7 +6,7 @@
       "provenance": [],
       "machine_shape": "hm",
       "gpuType": "V100",
-      "authorship_tag": "ABX9TyPHtqq96zm8/DDNC9+543fd",
+      "authorship_tag": "ABX9TyPNl/WKBYXOzuJCP/puYm6d",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -35,7 +35,9 @@
         "# Fine-tune Llama 2 in Google Colab\n",
         "> 🗣️ Large Language Model Course\n",
         "\n",
-        "❤️ Created by [@maximelabonne](), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da).\n"
+        "❤️ Created by [@maximelabonne](), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da).\n",
+        "\n",
+        "This notebook runs on a T4 GPU with high RAM. (Last update: 23 Jul 2023)\n"
       ],
       "metadata": {
         "id": "OSHlAbqzDFDq"
@@ -79,78 +81,110 @@
     {
       "cell_type": "code",
       "source": [
-        "# Used for multi-gpu\n",
-        "local_rank = -1\n",
-        "per_device_train_batch_size = 4\n",
-        "per_device_eval_batch_size = 1\n",
-        "gradient_accumulation_steps = 4\n",
-        "learning_rate = 2e-4\n",
-        "max_grad_norm = 0.3\n",
-        "weight_decay = 0.001\n",
-        "lora_alpha = 16\n",
-        "lora_dropout = 0.1\n",
-        "lora_r = 64\n",
-        "max_seq_length = 512\n",
-        "\n",
         "# The model that you want to train from the Hugging Face hub\n",
         "model_name = \"daryl149/llama-2-7b-chat-hf\"\n",
         "\n",
+        "# The instruction dataset to use\n",
+        "dataset_name = \"mlabonne/guanaco-llama2-1k\"\n",
+        "\n",
         "# Fine-tuned model name\n",
         "new_model = \"llama-2-7b-guanaco\"\n",
         "\n",
-        "# The instruction dataset to use\n",
-        "dataset_name = \"timdettmers/openassistant-guanaco\"\n",
+        "################################################################################\n",
+        "# QLoRA parameters\n",
+        "################################################################################\n",
+        "\n",
+        "# Lora attention dimension\n",
+        "lora_r = 64\n",
+        "\n",
+        "# Alpha parameter for Lora scaling\n",
+        "lora_alpha = 16\n",
+        "\n",
+        "# Dropout probability for Lora layers\n",
+        "lora_dropout = 0.1\n",
+        "\n",
+        "################################################################################\n",
+        "# bitsandbytes parameters\n",
+        "################################################################################\n",
         "\n",
         "# Activate 4-bit precision base model loading\n",
         "use_4bit = True\n",
         "\n",
-        "# Activate nested quantization for 4-bit base models\n",
-        "use_nested_quant = False\n",
-        "\n",
         "# Compute dtype for 4-bit base models\n",
         "bnb_4bit_compute_dtype = \"float16\"\n",
         "\n",
-        "# Quantization type (fp4 or nf4=\n",
+        "# Quantization type (fp4 or nf4)\n",
         "bnb_4bit_quant_type = \"nf4\"\n",
         "\n",
+        "# Activate nested quantization for 4-bit base models (double quantization)\n",
+        "use_nested_quant = False\n",
+        "\n",
+        "################################################################################\n",
+        "# TrainingArguments parameters\n",
+        "################################################################################\n",
+        "\n",
+        "# Output directory where the model predictions and checkpoints will be stored\n",
+        "output_dir = \"./results\"\n",
+        "\n",
         "# Number of training epochs\n",
         "num_train_epochs = 1\n",
         "\n",
-        "# Enable fp16 training\n",
+        "# Enable fp16/bf16 training (set bf16 to True with an A100)\n",
         "fp16 = False\n",
-        "\n",
-        "# Enable bf16 training\n",
         "bf16 = False\n",
         "\n",
-        "# Use packing dataset creating\n",
-        "packing = False\n",
+        "# Batch size per GPU for training\n",
+        "per_device_train_batch_size = 4\n",
+        "\n",
+        "# Batch size per GPU for evaluation\n",
+        "per_device_eval_batch_size = 4\n",
+        "\n",
+        "# Number of update steps to accumulate the gradients for\n",
+        "gradient_accumulation_steps = 1\n",
         "\n",
         "# Enable gradient checkpointing\n",
         "gradient_checkpointing = True\n",
         "\n",
+        "# Maximum gradient normal (gradient clipping)\n",
+        "max_grad_norm = 0.3\n",
+        "\n",
+        "# Initial learning rate (AdamW optimizer)\n",
+        "learning_rate = 2e-4\n",
+        "\n",
+        "# Weight decay to apply to all layers except bias/LayerNorm weights\n",
+        "weight_decay = 0.001\n",
+        "\n",
         "# Optimizer to use\n",
         "optim = \"paged_adamw_32bit\"\n",
         "\n",
-        "# Learning rate schedule (constant a bit better than cosine, and has advantage for analysis)\n",
+        "# Learning rate schedule (constant a bit better than cosine)\n",
         "lr_scheduler_type = \"constant\"\n",
         "\n",
-        "# Number of optimizer update steps\n",
-        "max_steps = 10000\n",
+        "# Number of training steps (overrides num_train_epochs)\n",
+        "max_steps = -1\n",
         "\n",
-        "# Fraction of steps to do a warmup for\n",
+        "# Ratio of steps for a linear warmup (from 0 to learning rate)\n",
         "warmup_ratio = 0.03\n",
         "\n",
-        "# Group sequences into batches with same length (saves memory and speeds up training considerably)\n",
+        "# Group sequences into batches with same length\n",
+        "# Saves memory and speeds up training considerably\n",
         "group_by_length = True\n",
         "\n",
         "# Save checkpoint every X updates steps\n",
         "save_steps = 10\n",
         "\n",
         "# Log every X updates steps\n",
-        "logging_steps = 10\n",
+        "logging_steps = 1\n",
         "\n",
-        "# The output directory where the model predictions and checkpoints will be written\n",
-        "output_dir = \"./results\"\n",
+        "################################################################################\n",
+        "# SFT parameters\n",
+        "################################################################################\n",
+        "\n",
+        "# Maximum sequence length to use\n",
+        "max_seq_length = None\n",
+        "\n",
+        "# Pack multiple short examples in the same input sequence to increase efficiency\n",
+        "packing = False\n",
         "\n",
         "# Load the entire model on the GPU 0\n",
         "device_map = {\"\": 0}"
@@ -164,6 +198,7 @@
     {
       "cell_type": "code",
       "source": [
+        "# Load dataset (you can process it here)\n",
         "dataset = load_dataset(dataset_name, split=\"train\")\n",
         "\n",
         "# Load tokenizer and model with QLoRA configuration\n",
@@ -176,13 +211,15 @@
         "    bnb_4bit_use_double_quant=use_nested_quant,\n",
         ")\n",
         "\n",
+        "# Check GPU compatibility with bfloat16\n",
         "if compute_dtype == torch.float16 and use_4bit:\n",
         "    major, _ = torch.cuda.get_device_capability()\n",
         "    if major >= 8:\n",
         "        print(\"=\" * 80)\n",
-        "        print(\"Your GPU supports bfloat16, you can accelerate training with the argument --bf16\")\n",
+        "        print(\"Your GPU supports bfloat16: accelerate training with bf16=True\")\n",
         "        print(\"=\" * 80)\n",
         "\n",
+        "# Load base model\n",
         "model = AutoModelForCausalLM.from_pretrained(\n",
         "    model_name,\n",
         "    quantization_config=bnb_config,\n",
@@ -191,6 +228,7 @@
         "model.config.use_cache = False\n",
         "model.config.pretraining_tp = 1\n",
         "\n",
+        "# Load LoRA configuration\n",
         "peft_config = LoraConfig(\n",
         "    lora_alpha=lora_alpha,\n",
         "    lora_dropout=lora_dropout,\n",
@@ -199,19 +237,22 @@
         "    task_type=\"CAUSAL_LM\",\n",
         ")\n",
         "\n",
+        "# Load LLaMA tokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
         "tokenizer.pad_token = tokenizer.eos_token\n",
-        "# Fix weird overflow issue with fp16 training\n",
-        "tokenizer.padding_side = \"right\"\n",
+        "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training\n",
         "\n",
+        "# Set training parameters\n",
         "training_arguments = TrainingArguments(\n",
         "    output_dir=output_dir,\n",
+        "    num_train_epochs=num_train_epochs,\n",
         "    per_device_train_batch_size=per_device_train_batch_size,\n",
         "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
         "    optim=optim,\n",
         "    save_steps=save_steps,\n",
         "    logging_steps=logging_steps,\n",
         "    learning_rate=learning_rate,\n",
+        "    weight_decay=weight_decay,\n",
         "    fp16=fp16,\n",
         "    bf16=bf16,\n",
         "    max_grad_norm=max_grad_norm,\n",
@@ -219,8 +260,10 @@
         "    warmup_ratio=warmup_ratio,\n",
         "    group_by_length=group_by_length,\n",
         "    lr_scheduler_type=lr_scheduler_type,\n",
+        "    report_to=\"tensorboard\"\n",
         ")\n",
         "\n",
+        "# Set supervised fine-tuning parameters\n",
         "trainer = SFTTrainer(\n",
         "    model=model,\n",
         "    train_dataset=dataset,\n",
@@ -232,7 +275,10 @@
         "    packing=packing,\n",
         ")\n",
         "\n",
+        "# Train model\n",
         "trainer.train()\n",
+        "\n",
+        "# Save trained model\n",
         "trainer.model.save_pretrained(output_dir)"
       ],
       "metadata": {
@@ -267,29 +313,21 @@
     {
       "cell_type": "code",
       "source": [
-        "from numba import cuda\n",
-        "\n",
-        "if use_4bit:\n",
-        "    del model\n",
-        "    torch.cuda.empty_cache()\n",
-        "    cuda.select_device(0)\n",
-        "    cuda.close()\n",
-        "\n",
-        "    base_model = AutoModelForCausalLM.from_pretrained(\n",
-        "        model_name,\n",
-        "        low_cpu_mem_usage=True,\n",
-        "        return_dict=True,\n",
-        "        torch_dtype=torch.float16,\n",
-        "        device_map=device_map,\n",
-        "    )\n",
-        "    model = PeftModel.from_pretrained(base_model, output_dir, offload_folder=\"/content/sample_data\")\n",
-        "    model = model.merge_and_unload()\n",
-        "\n",
-        "# Save merged weights and tokenizer\n",
-        "model.save_pretrained(new_model, use_safetensors=True)\n",
+        "# Reload model in FP16 and merge it with LoRA weights\n",
+        "base_model = AutoModelForCausalLM.from_pretrained(\n",
+        "    model_name,\n",
+        "    low_cpu_mem_usage=True,\n",
+        "    return_dict=True,\n",
+        "    torch_dtype=torch.float16,\n",
+        "    device_map=device_map,\n",
+        ")\n",
+        "model = PeftModel.from_pretrained(base_model, output_dir)\n",
+        "model = model.merge_and_unload()\n",
+        "\n",
+        "# Reload tokenizer to save it\n",
         "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
         "tokenizer.pad_token = tokenizer.eos_token\n",
-        "tokenizer.save_pretrained(new_model)"
+        "tokenizer.padding_side = \"right\""
       ],
       "metadata": {
         "id": "QQn30cRtAZ-P"