@@ -0,0 +1,313 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "machine_shape": "hm",
+ "gpuType": "V100",
+ "authorship_tag": "ABX9TyNKKJhDeFB7aXPizGqrvwhA",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "<a href=\"https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_Llama_2_in_Google_Colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Fine-tune Llama 2 in Google Colab\n",
+ "> 🗣️ Large Language Model Course\n",
+ "\n",
+ "❤️ Created by [@maximelabonne](), based on Pclanglais' [GitHub Gist](https://gist.github.com/Pclanglais/e90381ed142ee80c8e7ea602b18d50f0).\n"
+ ],
+ "metadata": {
+ "id": "OSHlAbqzDFDq"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GLXwJqbjtPho"
+ },
+ "outputs": [],
+ "source": [
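+ "# Install the pinned versions of the libraries used for QLoRA fine-tuning\n",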
+ "!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.30.2 trl==0.4.7"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from datasets import load_dataset\n",
+ "from transformers import (\n",
+ " AutoModelForCausalLM,\n",
+ " AutoTokenizer,\n",
+ " BitsAndBytesConfig,\n",
+ " HfArgumentParser,\n",
+ " TrainingArguments,\n",
+ " pipeline,\n",
+ " logging,\n",
+ ")\n",
+ "from peft import LoraConfig, PeftModel\n",
+ "from trl import SFTTrainer"
+ ],
+ "metadata": {
+ "id": "nAMzy_0FtaUZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Used for multi-gpu\n",
|
|
|
+ "local_rank = -1\n",
|
|
|
+ "per_device_train_batch_size = 4\n",
|
|
|
+ "per_device_eval_batch_size = 1\n",
|
|
|
+ "gradient_accumulation_steps = 4\n",
|
|
|
+ "learning_rate = 2e-4\n",
|
|
|
+ "max_grad_norm = 0.3\n",
|
|
|
+ "weight_decay = 0.001\n",
|
|
|
+ "lora_alpha = 16\n",
|
|
|
+ "lora_dropout = 0.1\n",
|
|
|
+ "lora_r = 64\n",
|
|
|
+ "max_seq_length = 512\n",
|
|
|
+ "\n",
|
|
|
+ "# The model that you want to train from the Hugging Face hub\n",
|
|
|
+ "model_name = \"daryl149/llama-2-7b-chat-hf\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Fine-tuned model name\n",
|
|
|
+ "new_model = \"llama-2-7b-guanaco\"\n",
|
|
|
+ "\n",
|
|
|
+ "# The instruction dataset to use\n",
|
|
|
+ "dataset_name = \"timdettmers/openassistant-guanaco\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Activate 4-bit precision base model loading\n",
|
|
|
+ "use_4bit = True\n",
|
|
|
+ "\n",
|
|
|
+ "# Activate nested quantization for 4-bit base models\n",
|
|
|
+ "use_nested_quant = False\n",
|
|
|
+ "\n",
|
|
|
+ "# Compute dtype for 4-bit base models\n",
|
|
|
+ "bnb_4bit_compute_dtype = \"float16\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Quantization type (fp4 or nf4=\n",
|
|
|
+ "bnb_4bit_quant_type = \"nf4\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Number of training epochs\n",
|
|
|
+ "num_train_epochs = 1\n",
|
|
|
+ "\n",
|
|
|
+ "# Enable fp16 training\n",
|
|
|
+ "fp16 = False\n",
|
|
|
+ "\n",
|
|
|
+ "# Enable bf16 training\n",
|
|
|
+ "bf16 = False\n",
|
|
|
+ "\n",
|
|
|
+ "# Use packing dataset creating\n",
|
|
|
+ "packing = False\n",
|
|
|
+ "\n",
|
|
|
+ "# Enable gradient checkpointing\n",
|
|
|
+ "gradient_checkpointing = True\n",
|
|
|
+ "\n",
|
|
|
+ "# Optimizer to use\n",
|
|
|
+ "optim = \"paged_adamw_32bit\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Learning rate schedule (constant a bit better than cosine, and has advantage for analysis)\n",
|
|
|
+ "lr_scheduler_type = \"constant\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Number of optimizer update steps\n",
|
|
|
+ "max_steps = 10000\n",
|
|
|
+ "\n",
|
|
|
+ "# Fraction of steps to do a warmup for\n",
|
|
|
+ "warmup_ratio = 0.03\n",
|
|
|
+ "\n",
|
|
|
+ "# Group sequences into batches with same length (saves memory and speeds up training considerably)\n",
|
|
|
+ "group_by_length = True\n",
|
|
|
+ "\n",
|
|
|
+ "# Save checkpoint every X updates steps\n",
|
|
|
+ "save_steps = 10\n",
|
|
|
+ "\n",
|
|
|
+ "# Log every X updates steps\n",
|
|
|
+ "logging_steps = 10\n",
|
|
|
+ "\n",
|
|
|
+ "# The output directory where the model predictions and checkpoints will be written\n",
|
|
|
+ "output_dir = \"./results\"\n",
|
|
|
+ "\n",
|
|
|
+ "# Load the entire model on the GPU 0\n",
|
|
|
+ "device_map = {\"\": 0}"
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "id": "ib_We3NLtj2E"
|
|
|
+ },
|
|
|
+ "execution_count": null,
|
|
|
+ "outputs": []
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "source": [
|
|
|
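+ "# Load the instruction dataset\n",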
+ "dataset = load_dataset(dataset_name, split=\"train\")\n",
+ "\n",
+ "# Load tokenizer and model with QLoRA configuration\n",
+ "compute_dtype = getattr(torch, bnb_4bit_compute_dtype)\n",
+ "\n",
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=use_4bit,\n",
+ " bnb_4bit_quant_type=bnb_4bit_quant_type,\n",
+ " bnb_4bit_compute_dtype=compute_dtype,\n",
+ " bnb_4bit_use_double_quant=use_nested_quant,\n",
+ ")\n",
+ "\n",
+ "if compute_dtype == torch.float16 and use_4bit:\n",
+ " major, _ = torch.cuda.get_device_capability()\n",
+ " if major >= 8:\n",
+ " print(\"=\" * 80)\n",
+ " print(\"Your GPU supports bfloat16, you can accelerate training with the argument --bf16\")\n",
+ " print(\"=\" * 80)\n",
+ "\n",
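+ "# Load the base model with the 4-bit quantization configuration\n",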
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " quantization_config=bnb_config,\n",
+ " device_map=device_map\n",
+ ")\n",
+ "model.config.use_cache = False\n",
+ "\n",
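+ "# Load LoRA configuration\n",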
+ "peft_config = LoraConfig(\n",
+ " lora_alpha=lora_alpha,\n",
+ " lora_dropout=lora_dropout,\n",
+ " r=lora_r,\n",
+ " bias=\"none\",\n",
+ " task_type=\"CAUSAL_LM\",\n",
+ ")\n",
+ "\n",
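+ "# Load the Llama 2 tokenizer\n",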
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "\n",
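+ "# Set the training arguments\n",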
+ "training_arguments = TrainingArguments(\n",
+ " output_dir=output_dir,\n",
+ " per_device_train_batch_size=per_device_train_batch_size,\n",
+ " gradient_accumulation_steps=gradient_accumulation_steps,\n",
+ " optim=optim,\n",
+ " save_steps=save_steps,\n",
+ " logging_steps=logging_steps,\n",
+ " learning_rate=learning_rate,\n",
+ " fp16=fp16,\n",
+ " bf16=bf16,\n",
+ " max_grad_norm=max_grad_norm,\n",
+ " max_steps=max_steps,\n",
+ " warmup_ratio=warmup_ratio,\n",
+ " group_by_length=group_by_length,\n",
+ " lr_scheduler_type=lr_scheduler_type,\n",
+ ")\n",
+ "\n",
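+ "# Set supervised fine-tuning parameters\n",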
+ "trainer = SFTTrainer(\n",
+ " model=model,\n",
+ " train_dataset=dataset,\n",
+ " peft_config=peft_config,\n",
+ " dataset_text_field=\"text\",\n",
+ " max_seq_length=max_seq_length,\n",
+ " tokenizer=tokenizer,\n",
+ " args=training_arguments,\n",
+ " packing=packing,\n",
+ ")\n",
+ "\n",
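+ "# Train the model and save the LoRA adapter\n",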
+ "trainer.train()\n",
+ "trainer.model.save_pretrained(output_dir)"
+ ],
+ "metadata": {
+ "id": "OJXpOgBFuSrc"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
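+ "# Silence transformers logging during generation\n",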
+ "logging.set_verbosity(logging.CRITICAL)\n",
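+ "# Run a quick text-generation test with the fine-tuned model\n",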
+ "pipe = pipeline(task=\"text-generation\", model=model, tokenizer=tokenizer, max_length=200)\n",
+ "result = pipe(\"Tell me a joke\")\n",
+ "print(result[0]['generated_text'])"
+ ],
+ "metadata": {
+ "id": "frlSLPin4IJ4"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from numba import cuda\n",
+ "\n",
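+ "# Free VRAM, then reload the base model in FP16 and merge it with the LoRA weights\n",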
+ "if use_4bit:\n",
+ " del model\n",
+ " torch.cuda.empty_cache()\n",
+ " cuda.select_device(0)\n",
+ " cuda.close()\n",
+ "\n",
+ " base_model = AutoModelForCausalLM.from_pretrained(\n",
+ " model_name,\n",
+ " low_cpu_mem_usage=True,\n",
+ " return_dict=True,\n",
+ " torch_dtype=torch.float16,\n",
+ " device_map=device_map,\n",
+ " )\n",
+ " model = PeftModel.from_pretrained(base_model, output_dir, offload_folder=\"/content/sample_data\")\n",
+ " model = model.merge_and_unload()\n",
+ "\n",
+ "# Save merged weights and tokenizer\n",
+ "model.save_pretrained(new_model, use_safetensors=True)\n",
+ "tokenizer.save_pretrained(new_model)"
+ ],
+ "metadata": {
+ "id": "QQn30cRtAZ-P"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
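+ "# Log in to the Hugging Face Hub, then push the merged model and tokenizer\n",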
+ "!huggingface-cli login\n",
+ "\n",
+ "model.push_to_hub(new_model, use_temp_dir=False)\n",
+ "tokenizer.push_to_hub(new_model, use_temp_dir=False)"
+ ],
+ "metadata": {
+ "id": "x-xPb-_qB0dz"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}