{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "V100",
      "authorship_tag": "ABX9TyMgfvtuquE8AUCpv0te8LOT",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Fine-tune Llama 2 in Google Colab\n",
        "> 🗣️ Large Language Model Course\n",
        "\n",
        "❤️ Created by [@maximelabonne](), based on Pclanglais' [GitHub Gist](https://gist.github.com/Pclanglais/e90381ed142ee80c8e7ea602b18d50f0).\n"
      ],
      "metadata": {
        "id": "OSHlAbqzDFDq"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GLXwJqbjtPho"
      },
      "outputs": [],
      "source": [
        "%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.30.2 trl==0.4.7"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import gc\n",
        "import os\n",
        "import torch\n",
        "from datasets import load_dataset\n",
        "from transformers import (\n",
        "    AutoModelForCausalLM,\n",
        "    AutoTokenizer,\n",
        "    BitsAndBytesConfig,\n",
        "    HfArgumentParser,\n",
        "    TrainingArguments,\n",
        "    pipeline,\n",
        "    logging,\n",
        ")\n",
        "from peft import LoraConfig, PeftModel\n",
        "from trl import SFTTrainer"
      ],
      "metadata": {
        "id": "nAMzy_0FtaUZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Used for multi-gpu\n",
        "local_rank = -1\n",
        "per_device_train_batch_size = 4\n",
        "per_device_eval_batch_size = 1\n",
        "gradient_accumulation_steps = 4\n",
        "learning_rate = 2e-4\n",
        "max_grad_norm = 0.3\n",
        "weight_decay = 0.001\n",
        "lora_alpha = 16\n",
        "lora_dropout = 0.1\n",
        "lora_r = 64\n",
        "max_seq_length = 512\n",
        "\n",
        "# The model that you want to train from the Hugging Face hub\n",
        "model_name = \"daryl149/llama-2-7b-chat-hf\"\n",
        "\n",
        "# Fine-tuned model name\n",
        "new_model = \"llama-2-7b-guanaco\"\n",
        "\n",
        "# The instruction dataset to use\n",
        "dataset_name = \"timdettmers/openassistant-guanaco\"\n",
        "\n",
        "# Activate 4-bit precision base model loading\n",
        "use_4bit = True\n",
        "\n",
        "# Activate nested quantization for 4-bit base models\n",
        "use_nested_quant = False\n",
        "\n",
        "# Compute dtype for 4-bit base models\n",
        "bnb_4bit_compute_dtype = \"float16\"\n",
        "\n",
        "# Quantization type (fp4 or nf4)\n",
        "bnb_4bit_quant_type = \"nf4\"\n",
        "\n",
        "# Number of training epochs (ignored while max_steps is positive)\n",
        "num_train_epochs = 1\n",
        "\n",
        "# Enable fp16 training\n",
        "fp16 = False\n",
        "\n",
        "# Enable bf16 training\n",
        "bf16 = False\n",
        "\n",
        "# Use packing dataset creating\n",
        "packing = False\n",
        "\n",
        "# Enable gradient checkpointing\n",
        "# NOTE(review): this flag is not forwarded to TrainingArguments in the training cell - confirm whether it should be\n",
        "gradient_checkpointing = True\n",
        "\n",
        "# Optimizer to use\n",
        "optim = \"paged_adamw_32bit\"\n",
        "\n",
        "# Learning rate schedule (constant a bit better than cosine, and has advantage for analysis)\n",
        "lr_scheduler_type = \"constant\"\n",
        "\n",
        "# Number of optimizer update steps (a positive value overrides num_train_epochs; use -1 to train by epochs)\n",
        "max_steps = 10000\n",
        "\n",
        "# Fraction of steps to do a warmup for\n",
        "warmup_ratio = 0.03\n",
        "\n",
        "# Group sequences into batches with same length (saves memory and speeds up training considerably)\n",
        "group_by_length = True\n",
        "\n",
        "# Save checkpoint every X updates steps\n",
        "save_steps = 10\n",
        "\n",
        "# Log every X updates steps\n",
        "logging_steps = 10\n",
        "\n",
        "# The output directory where the model predictions and checkpoints will be written\n",
        "output_dir = \"./results\"\n",
        "\n",
        "# Load the entire model on the GPU 0\n",
        "device_map = {\"\": 0}"
      ],
      "metadata": {
        "id": "ib_We3NLtj2E"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "dataset = load_dataset(dataset_name, split=\"train\")\n",
        "\n",
        "# Load tokenizer and model with QLoRA configuration\n",
        "compute_dtype = getattr(torch, bnb_4bit_compute_dtype)\n",
        "\n",
        "bnb_config = BitsAndBytesConfig(\n",
        "    load_in_4bit=use_4bit,\n",
        "    bnb_4bit_quant_type=bnb_4bit_quant_type,\n",
        "    bnb_4bit_compute_dtype=compute_dtype,\n",
        "    bnb_4bit_use_double_quant=use_nested_quant,\n",
        ")\n",
        "\n",
        "# Hint at bf16 when the GPU reports compute capability 8.x or newer\n",
        "if compute_dtype == torch.float16 and use_4bit:\n",
        "    major, _ = torch.cuda.get_device_capability()\n",
        "    if major >= 8:\n",
        "        print(\"=\" * 80)\n",
        "        print(\"Your GPU supports bfloat16: accelerate training with bf16=True\")\n",
        "        print(\"=\" * 80)\n",
        "\n",
        "model = AutoModelForCausalLM.from_pretrained(\n",
        "    model_name,\n",
        "    quantization_config=bnb_config,\n",
        "    device_map=device_map\n",
        ")\n",
        "model.config.use_cache = False\n",
        "\n",
        "peft_config = LoraConfig(\n",
        "    lora_alpha=lora_alpha,\n",
        "    lora_dropout=lora_dropout,\n",
        "    r=lora_r,\n",
        "    bias=\"none\",\n",
        "    task_type=\"CAUSAL_LM\",\n",
        ")\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "# weight_decay and num_train_epochs are forwarded so the values declared in the\n",
        "# parameter cell are actually honoured by the trainer.\n",
        "training_arguments = TrainingArguments(\n",
        "    output_dir=output_dir,\n",
        "    num_train_epochs=num_train_epochs,\n",
        "    per_device_train_batch_size=per_device_train_batch_size,\n",
        "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
        "    optim=optim,\n",
        "    save_steps=save_steps,\n",
        "    logging_steps=logging_steps,\n",
        "    learning_rate=learning_rate,\n",
        "    weight_decay=weight_decay,\n",
        "    fp16=fp16,\n",
        "    bf16=bf16,\n",
        "    max_grad_norm=max_grad_norm,\n",
        "    max_steps=max_steps,\n",
        "    warmup_ratio=warmup_ratio,\n",
        "    group_by_length=group_by_length,\n",
        "    lr_scheduler_type=lr_scheduler_type,\n",
        ")\n",
        "\n",
        "trainer = SFTTrainer(\n",
        "    model=model,\n",
        "    train_dataset=dataset,\n",
        "    peft_config=peft_config,\n",
        "    dataset_text_field=\"text\",\n",
        "    max_seq_length=max_seq_length,\n",
        "    tokenizer=tokenizer,\n",
        "    args=training_arguments,\n",
        "    packing=packing,\n",
        ")\n",
        "\n",
        "trainer.train()\n",
        "trainer.model.save_pretrained(output_dir)"
      ],
      "metadata": {
        "id": "OJXpOgBFuSrc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "logging.set_verbosity(logging.CRITICAL)\n",
        "pipe = pipeline(task=\"text-generation\", model=model, tokenizer=tokenizer, max_length=200)\n",
        "result = pipe(\"Tell me a joke\")\n",
        "print(result[0]['generated_text'])"
      ],
      "metadata": {
        "id": "frlSLPin4IJ4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The 4-bit training model is referenced by `model`, by the generation `pipe`, and by the `trainer`, so `del model` alone does not release its VRAM. The next cell drops all of these references, runs the garbage collector, and empties the CUDA cache before reloading the base model in FP16 and merging the LoRA adapter into it. If you still run out of memory, restart the notebook, re-execute the first three code cells (install, imports, parameters), and then execute the next cell."
      ],
      "metadata": {
        "id": "6WjzALHtSfdb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop every live reference to the training model before reloading it.\n",
        "# Missing names are tolerated so this cell also works on a fresh kernel\n",
        "# (after running only the install, import and parameter cells) and on re-runs.\n",
        "for stale_name in (\"model\", \"base_model\", \"pipe\", \"trainer\"):\n",
        "    globals().pop(stale_name, None)\n",
        "gc.collect()\n",
        "torch.cuda.empty_cache()\n",
        "\n",
        "# Reload the base model in FP16 and merge the saved LoRA adapter into it\n",
        "base_model = AutoModelForCausalLM.from_pretrained(\n",
        "    model_name,\n",
        "    low_cpu_mem_usage=True,\n",
        "    return_dict=True,\n",
        "    torch_dtype=torch.float16,\n",
        "    device_map=device_map,\n",
        ")\n",
        "model = PeftModel.from_pretrained(base_model, output_dir, offload_folder=\"/content/sample_data\")\n",
        "model = model.merge_and_unload()\n",
        "\n",
        "# Save merged weights (safetensors format) and tokenizer\n",
        "model.save_pretrained(new_model, safe_serialization=True)\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "tokenizer.save_pretrained(new_model)"
      ],
      "metadata": {
        "id": "QQn30cRtAZ-P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!huggingface-cli login\n",
        "\n",
        "# safe_serialization matches the format written by save_pretrained in the previous cell\n",
        "model.push_to_hub(new_model, use_temp_dir=False, safe_serialization=True)\n",
        "tokenizer.push_to_hub(new_model, use_temp_dir=False)"
      ],
      "metadata": {
        "id": "x-xPb-_qB0dz"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}