{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f68aee84-04e3-4cbc-be78-6de9e06e704f",
   "metadata": {},
   "source": [
    "Notebook for uploading PDF, extracting all Text and Pre-Processing using a 1B or 3B model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f4fc7aef-3505-482e-a998-790b8b9d48e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting PyPDF2\n",
      "  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)\n",
      "Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
      "Installing collected packages: PyPDF2\n",
      "Successfully installed PyPDF2-3.0.1\n"
     ]
    }
   ],
   "source": [
    "!pip install PyPDF2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "60d0061b-8b8c-4353-850f-f19466a0ae2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_path = './2402.13116v3.pdf'\n",
    "DEFAULT_MODEL = \"meta-llama/Llama-3.2-1B-Instruct\"\n",
    "#DEFAULT_MODEL = \"meta-llama/Llama-3.2-1B-Instruct\" <- Don't think this would be necessary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "9418ac5e-df65-4c03-ac64-48a1275afa39",
   "metadata": {},
   "outputs": [],
   "source": [
    "from difflib import HtmlDiff\n",
    "from IPython.display import HTML, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "21029232-ac5f-42ca-b26b-baad5b2f49b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "import PyPDF2\n",
    "from typing import Optional\n",
    "import os\n",
    "import torch\n",
    "from accelerate import Accelerator\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from tqdm.notebook import tqdm\n",
    "import warnings\n",
    "\n",
    "accelerator = Accelerator()\n",
    "device = accelerator.device\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "153d9ece-37a4-4fff-a8e8-53f923a2b0a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def validate_pdf(file_path: str) -> bool:\n",
    "    if not os.path.exists(file_path):\n",
    "        print(f\"Error: File not found at path: {file_path}\")\n",
    "        return False\n",
    "    if not file_path.lower().endswith('.pdf'):\n",
    "        print(\"Error: File is not a PDF\")\n",
    "        return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b57c2d64-3d75-4aeb-b4ee-bd1661286b66",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:\n",
    "    if not validate_pdf(file_path):\n",
    "        return None\n",
    "    \n",
    "    try:\n",
    "        with open(file_path, 'rb') as file:\n",
    "            # Create PDF reader object\n",
    "            pdf_reader = PyPDF2.PdfReader(file)\n",
    "            \n",
    "            # Get total number of pages\n",
    "            num_pages = len(pdf_reader.pages)\n",
    "            print(f\"Processing PDF with {num_pages} pages...\")\n",
    "            \n",
    "            extracted_text = []\n",
    "            total_chars = 0\n",
    "            \n",
    "            # Iterate through all pages\n",
    "            for page_num in range(num_pages):\n",
    "                # Extract text from page\n",
    "                page = pdf_reader.pages[page_num]\n",
    "                text = page.extract_text()\n",
    "                \n",
    "                # Check if adding this page's text would exceed the limit\n",
    "                if total_chars + len(text) > max_chars:\n",
    "                    # Only add text up to the limit\n",
    "                    remaining_chars = max_chars - total_chars\n",
    "                    extracted_text.append(text[:remaining_chars])\n",
    "                    print(f\"Reached {max_chars} character limit at page {page_num + 1}\")\n",
    "                    break\n",
    "                \n",
    "                extracted_text.append(text)\n",
    "                total_chars += len(text)\n",
    "                print(f\"Processed page {page_num + 1}/{num_pages}\")\n",
    "            \n",
    "            final_text = '\\n'.join(extracted_text)\n",
    "            print(f\"\\nExtraction complete! Total characters: {len(final_text)}\")\n",
    "            return final_text\n",
    "            \n",
    "    except PyPDF2.PdfReadError:\n",
    "        print(\"Error: Invalid or corrupted PDF file\")\n",
    "        return None\n",
    "    except Exception as e:\n",
    "        print(f\"An unexpected error occurred: {str(e)}\")\n",
    "        return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0984bb1e-d52c-4cec-a131-67a48061fabc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get PDF metadata\n",
    "def get_pdf_metadata(file_path: str) -> Optional[dict]:\n",
    "    if not validate_pdf(file_path):\n",
    "        return None\n",
    "    \n",
    "    try:\n",
    "        with open(file_path, 'rb') as file:\n",
    "            pdf_reader = PyPDF2.PdfReader(file)\n",
    "            metadata = {\n",
    "                'num_pages': len(pdf_reader.pages),\n",
    "                'metadata': pdf_reader.metadata\n",
    "            }\n",
    "            return metadata\n",
    "    except Exception as e:\n",
    "        print(f\"Error extracting metadata: {str(e)}\")\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "63848943-79cc-4e21-8396-6eab5df493e0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting metadata...\n",
      "\n",
      "PDF Metadata:\n",
      "Number of pages: 44\n",
      "Document info:\n",
      "/Author: \n",
      "/CreationDate: D:20240311015030Z\n",
      "/Creator: LaTeX with hyperref\n",
      "/Keywords: \n",
      "/ModDate: D:20240311015030Z\n",
      "/PTEX.Fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5\n",
      "/Producer: pdfTeX-1.40.25\n",
      "/Subject: \n",
      "/Title: \n",
      "/Trapped: /False\n",
      "\n",
      "Extracting text...\n",
      "Processing PDF with 44 pages...\n",
      "Processed page 1/44\n",
      "Processed page 2/44\n",
      "Processed page 3/44\n",
      "Processed page 4/44\n",
      "Processed page 5/44\n",
      "Processed page 6/44\n",
      "Processed page 7/44\n",
      "Processed page 8/44\n",
      "Processed page 9/44\n",
      "Processed page 10/44\n",
      "Processed page 11/44\n",
      "Processed page 12/44\n",
      "Processed page 13/44\n",
      "Processed page 14/44\n",
      "Processed page 15/44\n",
      "Processed page 16/44\n",
      "Reached 100000 character limit at page 17\n",
      "\n",
      "Extraction complete! Total characters: 100016\n",
      "\n",
      "Preview of extracted text (first 500 characters):\n",
      "--------------------------------------------------\n",
      "1\n",
      "A Survey on Knowledge Distillation of Large\n",
      "Language Models\n",
      "Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk jl0725@connect.hku.hk\n",
      "Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati\n",
      "--------------------------------------------------\n",
      "\n",
      "Total characters extracted: 100016\n",
      "\n",
      "Extracted text has been saved to extracted_text.txt\n"
     ]
    }
   ],
   "source": [
    "# Extract metadata first\n",
    "print(\"Extracting metadata...\")\n",
    "metadata = get_pdf_metadata(pdf_path)\n",
    "if metadata:\n",
    "    print(\"\\nPDF Metadata:\")\n",
    "    print(f\"Number of pages: {metadata['num_pages']}\")\n",
    "    print(\"Document info:\")\n",
    "    for key, value in metadata['metadata'].items():\n",
    "        print(f\"{key}: {value}\")\n",
    "\n",
    "# Extract text\n",
    "print(\"\\nExtracting text...\")\n",
    "extracted_text = extract_text_from_pdf(pdf_path)\n",
    "\n",
    "# Display first 500 characters of extracted text as preview\n",
    "if extracted_text:\n",
    "    print(\"\\nPreview of extracted text (first 500 characters):\")\n",
    "    print(\"-\" * 50)\n",
    "    print(extracted_text[:500])\n",
    "    print(\"-\" * 50)\n",
    "    print(f\"\\nTotal characters extracted: {len(extracted_text)}\")\n",
    "\n",
    "# Optional: Save the extracted text to a file\n",
    "if extracted_text:\n",
    "    output_file = 'extracted_text.txt'\n",
    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
    "        f.write(extracted_text)\n",
    "    print(f\"\\nExtracted text has been saved to {output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "7c0828a5-964d-475e-b5f5-40a04e287725",
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "SYS_PROMPT = \"\"\"\n",
    "You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.\n",
    "\n",
    "The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.\n",
    "\n",
    "Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive\n",
    "\n",
    "The goal is to use this in a podcast research transcript so a lot of the emails, citations, and things like that can be removed-please be smart with what you remove and be creative ok?\n",
    "\n",
    "Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RETURNING AS IS\n",
    "\n",
    "Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.\n",
    "\n",
    "ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?\n",
    "Here is the text:\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d04a4f07-b0b3-45ca-8f41-a433e1abe050",
   "metadata": {},
   "outputs": [],
   "source": [
    "accelerator = Accelerator()\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    DEFAULT_MODEL,\n",
    "    torch_dtype=torch.bfloat16,\n",
    "    use_safetensors=True,\n",
    "    device_map=device,\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)\n",
    "model, tokenizer = accelerator.prepare(model, tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "f6ed2dd9-c9b5-4d0a-8cc2-ab184ce5ad68",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_html_diff(text1, text2, chunk_num):\n",
    "    \"\"\"Create HTML diff between two texts\"\"\"\n",
    "    # Wrap text to make it more readable\n",
    "    text1_lines = textwrap.wrap(text1, width=80)\n",
    "    text2_lines = textwrap.wrap(text2, width=80)\n",
    "    \n",
    "    # Create diff\n",
    "    diff = HtmlDiff(wrapcolumn=80)\n",
    "    html = diff.make_file(\n",
    "        text1_lines, \n",
    "        text2_lines,\n",
    "        fromdesc=f\"Original (Chunk {chunk_num})\",\n",
    "        todesc=f\"Processed (Chunk {chunk_num})\",\n",
    "        context=True\n",
    "    )\n",
    "    \n",
    "    return html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "bbda5241-e890-4402-87dd-514d6761bb9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_chunk(text_chunk):\n",
    "    conversation = [\n",
    "        {\"role\": \"system\", \"content\": SYS_PROMPT},\n",
    "        {\"role\": \"user\", \"content\": text_chunk},\n",
    "    ]\n",
    "    \n",
    "    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n",
    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
    "    \n",
    "    with torch.no_grad():  # Add this for efficiency\n",
    "        output = model.generate(\n",
    "            **inputs,\n",
    "            temperature=0.7,\n",
    "            top_p=0.9,\n",
    "            max_new_tokens=8126\n",
    "        )\n",
    "\n",
    "    diff_html = create_html_diff(text_chunk, processed_text, chunk_num)\n",
    "    \n",
    "    return tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "5311a77f-c98c-4009-a982-c4393fd64fa4",
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUT_FILE = \"./extracted_text.txt\"  # Replace with your file path\n",
    "CHUNK_SIZE = 1000\n",
    "output_file = f\"clean_{os.path.basename(INPUT_FILE)}\"\n",
    "diff_file = f\"diff_{os.path.basename(INPUT_FILE)}.html\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "b32fdfb5-85ab-4a62-b1ea-0f28e4d9ba44",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "' font-family'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[36], line 22\u001b[0m\n\u001b[1;32m      1\u001b[0m html_header \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;124m<html>\u001b[39m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;124m<head>\u001b[39m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;124m<style>\u001b[39m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;124m    body \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m font-family: Arial, sans-serif; margin: 20px; }\u001b[39m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;124m    .diff_header \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m background-color: #f8f9fa; padding: 5px; }\u001b[39m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;124m    table.diff \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m font-family: monospace; border-collapse: collapse; width: 100\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m; }\u001b[39m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;124m    .diff_next \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m background-color: #f8f9fa; padding: 5px; }\u001b[39m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;124m    .diff_add \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m background-color: #e6ffe6; }\u001b[39m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;124m    .diff_chg \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m background-color: #fff3cd; }\u001b[39m\n\u001b[1;32m     11\u001b[0m \u001b[38;5;124m    .diff_sub \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m background-color: #ffe6e6; }\u001b[39m\n\u001b[1;32m     12\u001b[0m \u001b[38;5;124m    td \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m padding: 5px; border: 1px solid #ddd; }\u001b[39m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;124m    th \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m padding: 10px; background-color: #f8f9fa; }\u001b[39m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;124m    .chunk-separator \u001b[39m\u001b[38;5;124m{\u001b[39m\u001b[38;5;124m margin: 20px 0; border-top: 2px solid #ddd; }\u001b[39m\n\u001b[1;32m     15\u001b[0m \u001b[38;5;124m</style>\u001b[39m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;124m</head>\u001b[39m\n\u001b[1;32m     17\u001b[0m \u001b[38;5;124m<body>\u001b[39m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;124m<h1>Text Processing Diff Report</h1>\u001b[39m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;124m<p>Input file: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m</p>\u001b[39m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;124m<p>Output file: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m</p>\u001b[39m\n\u001b[1;32m     21\u001b[0m \u001b[38;5;124m<p>Total chunks: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m</p>\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(INPUT_FILE, output_file, num_chunks)\n",
      "\u001b[0;31mKeyError\u001b[0m: ' font-family'"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "46e160e8-552a-43b2-9f9e-a7a52092318f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "22f0c0486878477f928833db31490ca7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Processing chunks:   0%|          | 0/101 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "1\n",
      "A Survey on Knowledge Distillation of Large\n",
      "Language Models\n",
      "Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk jl0725@connect.hku.hk\n",
      "Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ed knowledge to smaller models and its utility in model compression and self-\n",
      "improvement. Our survey is meticulously structured around three foundational pillars: algorithm ,skill, and verticalization – providing\n",
      "a comprehensive examination of KD mechanisms, the enhancement of specific cognitive abilities, and their practical implications\n",
      "across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and KD, illustrating how\n",
      "DA emerges as a powerfu...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "on and\n",
      "proposing future research directions. By bridging the gap between proprietary and open-source LLMs, this survey underscores the\n",
      "potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal\n",
      "terms that regulate the use of LLMs, ensuring ethical and lawful application of KD of LLMs. An associated Github repository is available\n",
      "at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs.\n",
      "Index Terms —Large lang...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " have un-\n",
      "locked new realms of possibility, from generating human-\n",
      "like text to offering sophisticated problem-solving capa-\n",
      "bilities. The core significance of these LLMs lies in their\n",
      "emergent abilities (Wei et al., 2022a,b; Xu et al., 2024a), a\n",
      "phenomenon where the models display capabilities beyond\n",
      "their explicit training objectives, enabling them to tackle a\n",
      "diverse array of tasks with remarkable proficiency. Their\n",
      "deep understanding of context, nuance, and the intrica-\n",
      "cies of human languag...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "g to revolutionize industries,\n",
      "augment human creativity, and redefine our interaction with\n",
      "technology.\n",
      "Despite the remarkable capabilities of proprietary LLMs\n",
      "like GPT-4 and Gemini, they are not without their shortcom-\n",
      "ings, particularly when viewed in light of the advantages\n",
      "offered by open-source models. A significant drawback is\n",
      "their limited accessibility and higher cost (OpenAI et al.,\n",
      "2023). These proprietary models often come with substantial\n",
      "usage fees and restricted access, making them ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "straints of accessibility, cost, and adaptability\n",
      "thus present significant challenges in leveraging the full\n",
      "potential of proprietary LLMs.\n",
      "In contrast to proprietary LLMs, open-source modelsarXiv:2402.13116v3  [cs.CL]  8 Mar 2024\n",
      "2\n",
      "like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al.,\n",
      "2023a) bring several notable advantages. One of the primary\n",
      "benefits of open-source models is their accessibility and\n",
      "adaptability. Without the constraints of licensing fees or\n",
      "restrictive usage policies, t...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "y stemming from their relatively\n",
      "limited scale and resources compared to their proprietary\n",
      "counterparts. One of the most significant limitations is\n",
      "the smaller model scale, which often results in lower per-\n",
      "formance on real-world tasks with a bunch of instruc-\n",
      "tions (Zheng et al., 2023a). These models, with fewer pa-\n",
      "rameters, may struggle to capture the depth and breadth\n",
      "of knowledge embodied in larger models like GPT-4. Ad-\n",
      "ditionally, the pre-training investment in these open-source\n",
      "models is...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ized applications. This\n",
      "limitation becomes particularly evident when these models\n",
      "are compared to the highly fine-tuned proprietary LLMs,\n",
      "which are often tailored to excel in a wide array of complex\n",
      "scenarios (OpenAI et al., 2023).\n",
      "Primarily, recognizing the disparities between propri-\n",
      "etary and open-source LLMs, KD techniques have surged\n",
      "as a means to bridge the performance gap between these\n",
      "models (Gou et al., 2021; Gupta and Agrawal, 2022). Knowl-\n",
      "edge distillation, in this context, involves ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "t al., 2021) has emerged as a\n",
      "prevalent paradigm to achieve knowledge distillation of\n",
      "LLMs, where a small seed of knowledge is used to prompt\n",
      "the LLM to generate more data with respect to a specific\n",
      "skill or domain (Taori et al., 2023). Secondly, KD still retains\n",
      "its fundamental role in compressing LLMs, making them\n",
      "more efficient without significant loss in performance. (Gu\n",
      "et al., 2024; Agarwal et al., 2024). More recently, the strategy\n",
      "of employing open-source LLMs as teachers for their own\n",
      "s...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "\n",
      "via self-generated knowledge.\n",
      "A key aspect of the knowledge distillation is the en-\n",
      "hancement of skills such as advanced context following\n",
      "(e.g., in-context learning (Huang et al., 2022a) and in-\n",
      "struction following (Taori et al., 2023)), improved align-\n",
      "ment with user intents (e.g., human values/principles (Cui\n",
      "et al., 2023a), and thinking patterns like chain-of-thought\n",
      "(CoT) (Mukherjee et al., 2023)), and NLP task specialization\n",
      "(e.g., semantic understanding (Ding et al., 2023a), and code\n",
      "gen...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "rom the\n",
      "proprietary models that have been extensively trained and\n",
      "fine-tuned in these areas.\n",
      "The benefits of knowledge distillation in the era of\n",
      "LLMs are multifaceted and transformative (Gu et al., 2024).\n",
      "Through a suite of distillation techniques, the gap between\n",
      "proprietary and open-source models is significantly nar-\n",
      "rowed (Chiang et al., 2023; Xu et al., 2023a) and even\n",
      "filled (Zhao et al., 2023a). This process not only streamlines\n",
      "computational requirements but also enhances the environ-\n",
      "m...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " growth across various industries\n",
      "and research domains.\n",
      "The escalating need for a comprehensive survey on the\n",
      "knowledge distillation of LLMs stems from the rapidly\n",
      "evolving landscape of AI (OpenAI et al., 2023; Team et al.,\n",
      "2023) and the increasing complexity of these models. As AI\n",
      "continues to penetrate various sectors, the ability to effi-\n",
      "ciently and effectively distill knowledge from proprietary\n",
      "LLMs to open-source ones becomes not just a technical\n",
      "aspiration but a practical necessity. This ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "eRankOptimizationy,1y,2y3y1y2y3≻≻rank……\n",
      "DataCuration\n",
      "X,YrawdatasynthesizefeedbackFeedback\n",
      "input\n",
      "outputSelf-Knowledge\n",
      "outputinputinput\n",
      "YlabelLabelingExpansion\n",
      "X,YdemonstrationsexpandFeature\n",
      "featureinput,outputextractSec.4Sec.5\n",
      "Sec.3.1Sec.3.2\n",
      "Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated\n",
      "as ‘Sec.’ in this figure. RM S(·)denotes the student reward model.\n",
      "the growing demand for more accessible, cost-effective, and\n",
      "adaptable ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "posing direc-\n",
      "tions for future research.\n",
      "Survey Organization. The remainder of this survey is orga-\n",
      "nized into several comprehensive sections, each designed to\n",
      "offer a deep dive into the multifaceted aspects of knowledge\n",
      "distillation within the realm ofLLMs. Following this intro-\n",
      "duction, §2 provides a foundational overview of knowledge\n",
      "distillation, comparing traditional techniques with those\n",
      "emerging in the era of LLMs and highlighting the role of\n",
      "data augmentation (DA) in this context. §3 del...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "guage understanding (NLU), genera-\n",
      "tion (NLG), information retrieval, recommendation systems,\n",
      "and the evaluation of text generation. In §5, we ventureinto domain-specific vertical distillation, showcasing how\n",
      "knowledge distillation techniques are applied within spe-\n",
      "cialized fields such as law, healthcare, finance, and science,\n",
      "illustrating the practical implications and transformative\n",
      "impact of these approaches. The survey suggests open\n",
      "problems in §6, identifying current challenges and gaps in...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "large, complex model (teacher) to a\n",
      "smaller, more efficient model (student) (Gou et al., 2021).\n",
      "This technique is pivotal in mitigating the challenges posed\n",
      "by the computational demands and resource constraints of\n",
      "deploying large-scale models in practical applications.\n",
      "Historically, knowledge distillation techniques, prior to\n",
      "the era of LLMs, primarily concentrated on transferring\n",
      "knowledge from complex, often cumbersome neural net-\n",
      "works to more compact and efficient architectures (Sanh\n",
      "et al.,...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " (Chenglin et al., 2023)\n",
      "ExpansionSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Code Alpaca (Chaudhary, 2023)\n",
      "Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a),\n",
      "WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b)\n",
      "CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a),\n",
      "Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), WaveCoder (Yu et al., 2024)\n",
      "ZeroGen (Ye et al...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "2024)\n",
      "Self-KnowledgeSelf-Instruct (Wang et al., 2022a), Self-Align (Sun et al., 2024b), RLCD (Yang et al., 2024a),\n",
      "ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), ReST (Gulcehre et al., 2023),\n",
      "Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022)\n",
      "DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a),\n",
      "Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et a...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "kill\n",
      "DistillationContext FollowingInstruction FollowingSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023),\n",
      "WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023),\n",
      "WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a),\n",
      "Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b),\n",
      "CAMEL (Li et al., 2023b), OpenChat (Wang et al., 2023c), Zephyr (Tunstall et al., 2023),...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "2023), UltraFeedback (Cui et al., 2023a),\n",
      "ValueCAI (Bai et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b),\n",
      "Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a)\n",
      "AgentTool UsingToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023),\n",
      "ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a),\n",
      "Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "OMP (Xu et al., 2024b), MaRio (Ramnath et al., 2023),\n",
      "ID (Jung et al., 2023), GPT-3 Labeling (Wang et al., 2021b), BioGPT (Guo et al., 2023a),\n",
      "ChatGPT NMT (Yang and Nicolai, 2023),\n",
      "Information RetrievalQUILL (Srinivasan et al., 2022), Promptgator (Dai et al., 2023b), InPars (Bonifacio et al., 2022),\n",
      "AugTriever (Meng et al., 2023), (Sun et al., 2023a), RankVicuna (Pradeep et al., 2023a),\n",
      "RankZephyr (Pradeep et al., 2023b), ExaRanker (Ferraretto et al., 2023),\n",
      "Recommendation NDR (Mysore et al., 20...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "lti-ModalityLLaVA (Liu et al., 2023e), SVIT (Zhao et al., 2023b), LVIS-Instruct4V (Wang et al., 2023e), Shikra (Chen et al., 2023c),\n",
      "LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b),\n",
      "Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e),\n",
      "Verticalization\n",
      "DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "network to mimic the\n",
      "output of a larger teacher network, often through techniques\n",
      "like soft target training, where the student learns from\n",
      "the softened softmax output of the teacher. Please refer to\n",
      "the survey (Gou et al., 2021) for more details on general\n",
      "knowledge distillation techniques in AI and DL.\n",
      "In contrast, the advent of LLMs has revolutionized\n",
      "the knowledge distillation landscape. The current era of\n",
      "knowledge distillation in LLMs shifts the focus from mere\n",
      "architecture compression to t...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "r reduce the model size , the current focus in LLM-based\n",
      "knowledge distillation is to extract and transfer the rich,\n",
      "nuanced understanding that these models have developed.\n",
      "The key to this modern approach lies in heuristic and\n",
      "carefully designed prompts, which are used to elicit specific\n",
      "knowledge (Ding et al., 2023b) or capabilities (Chaudhary,\n",
      "2023) from the LLMs. These prompts are crafted to tap\n",
      "into the LLM’s understanding and capabilities in various\n",
      "domains, ranging from natural language un...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "their explicit training objectives.\n",
      "Furthermore, this era of knowledge distillation also em-\n",
      "phasizes the transfer of more abstract qualities such as\n",
      "reasoning patterns (Mitra et al., 2023), preference align-\n",
      "ment (Cui et al., 2023a), and value alignment (Sun et al.,\n",
      "2024b). This is in stark contrast to the earlier focus on output\n",
      "replication (Taori et al., 2023), indicating a shift towards\n",
      "a more holistic and comprehensive transfer of cognitive\n",
      "capabilities. The current techniques involve not j...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " al., 2022) emerges as a critical paradigm integral\n",
      "to the process of knowledge distillation. Unlike traditional\n",
      "DA techniques such as paraphrasing (Gangal et al., 2022) orback-translation (Longpre et al., 2019), which primarily aim\n",
      "at expanding the training dataset in a somewhat mechanical\n",
      "manner. DA within the context of LLMs focuses on the\n",
      "generation of novel, context-rich training data tailored to\n",
      "specific domains and skills. This innovation is driven by the\n",
      "unique capabilities of LLMs to ge...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "capability gap between proprietary and open-\n",
      "source models. Through DA, LLMs are prompted to create\n",
      "targeted, high-quality datasets that are not merely larger in\n",
      "volume but are also rich in diversity and specificity. This\n",
      "approach enables the distillation process to be more effec-\n",
      "tive, ensuring that the distilled models not only replicate\n",
      "the teacher model’s output behavior but also embody its\n",
      "deep-seated understanding and cognitive strategies.\n",
      "The significance and necessity of DA for achieving...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ssible approach to harnessing\n",
      "the power of LLMs. It empowers open-source models with\n",
      "the ability to approximate the contextual adeptness, ethical\n",
      "alignment, and deep semantic insights characteristic of their\n",
      "proprietary counterparts, thereby democratizing access to\n",
      "advanced AI capabilities and fostering innovation across a\n",
      "broader spectrum of applications and users.\n",
      "2.3 Survey Scope\n",
      "Building on the discussions introduced earlier, this survey\n",
      "aims to comprehensively explore the landscape of knowl...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ions and methodologies of knowledge distillation. It\n",
      "includes an in-depth exploration of the processes involved\n",
      "in constructing knowledge from teacher models (e.g., pro-\n",
      "prietary LLMs) and integrating this knowledge into student\n",
      "models (e.g., open-source LLMs). Under the umbrella of\n",
      "‘knowledge ’, we delve into strategies such as labeling (Hsieh\n",
      "et al., 2023), expansion (Taori et al., 2023), curation (Gu-\n",
      "nasekar et al., 2023), feature understanding (Agarwal et al.,\n",
      "6\n",
      "2024), feedback mechanisms (...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "This analysis\n",
      "aims to illuminate how these algorithms facilitate the trans-\n",
      "fer of knowledge, ensuring that open-source models can\n",
      "replicate and, in some cases, surpass the capabilities of their\n",
      "proprietary counterparts.\n",
      "Skill Distillation. This facet examines the specific compe-\n",
      "tencies and capabilities enhanced through KD. It encom-\n",
      "passes detailed discussions on context following (Taori et al.,\n",
      "2023; Luo et al., 2023c), with subtopics like instruction\n",
      "following and retrieval-augmented generat...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n",
      "INPUT TEXT:\n",
      ", text generation evaluation, and code gen-\n",
      "eration. Finally, the survey addresses multi-modality (Liu\n",
      "et al., 2023e; Zhao et al., 2023b), exploring how KD enhances\n",
      "LLMs’ ability to interpret and integrate multiple forms of\n",
      "input, enriching their utility and applicability across various\n",
      "contexts.\n",
      "Verticalization Distillation. This section assesses the ap-\n",
      "plication of KD across diverse vertical domains, offering\n",
      "insights into how distilled LLMs can be tailored for spe-\n",
      "cialized fields such as La...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "roader AI and ML ecosystem.\n",
      "By navigating through these facets, this survey en-\n",
      "deavors to provide an extensive and nuanced analysis of\n",
      "knowledge distillation in the era of LLMs. It serves as a\n",
      "guide for researchers, practitioners, and enthusiasts in the\n",
      "field, shedding light on current methodologies, challenges,\n",
      "and opportunities for innovation in this rapidly evolving\n",
      "domain.\n",
      "Declaration. This survey represents our earnest effort to\n",
      "provide a comprehensive and insightful overview of knowl-\n",
      "edg...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " their impacts\n",
      "across a range of applications.\n",
      "2.4 Distillation Pipeline in LLM Era\n",
      "SeedKnowledgeSkill/Domain\n",
      "TeacherLLMKnowledgeElicitationStudentModelDistillationAlgorithmsteer\n",
      "driveGeneratedKnowledgeLearningObjectivetrain\n",
      "Fig. 4: An illustration of a general pipeline to distill knowl-\n",
      "edge from a large language model to a student model.\n",
      "The general distillation pipeline of LLMs is a structured\n",
      "and methodical process aimed at transferring knowledge\n",
      "from a sophisticated teacher model to a less ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "lves directing the teacher LLM towards a\n",
      "specific target skill or domain. This is achieved through care-\n",
      "fully crafted instructions or templates that guide the LLM’s\n",
      "focus. These instructions are designed to elicit responses\n",
      "that demonstrate the LLM’s proficiency in a particular area,\n",
      "be it a specialized domain like healthcare or law, or a skill\n",
      "such as reasoning or language understanding. The objective\n",
      "here is to utilize the teacher LLM’s extensive training and\n",
      "nuanced capabilities to generate ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " seed knowledge is crucial as it provides a\n",
      "foundation upon which the teacher model can build and\n",
      "expand, thereby creating more comprehensive and in-depth\n",
      "knowledge examples.\n",
      "III. Generation of Distillation Knowledge. In response\n",
      "to the seed knowledge and steering instructions, the teacher\n",
      "LLM generates knowledge examples. These examples are\n",
      "predominantly in the form of question-and-answer (QA)\n",
      "dialogues or narrative explanations, aligning with the nat-\n",
      "ural language processing/understanding cap...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ge examples to train the student\n",
      "model. This training is guided by a loss function that aligns\n",
      "with the learning objectives. The loss function quantifies\n",
      "the student model’s performance in replicating or adapting\n",
      "the knowledge from the teacher model. By minimizing this\n",
      "loss, the student model learns to emulate the target skills or\n",
      "domain knowledge of the teacher, thereby acquiring similar\n",
      "capabilities. The process involves iteratively adjusting the\n",
      "student model’s parameters to reduce the discre...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ch the LLM can\n",
      "explore to generate novel knowledge, Parse( o, s)stands for\n",
      "to parse the distillation example ( e.g., (x, y)) from the\n",
      "teacher LLM’s output o(plus the input sin some cases),\n",
      "andpTrepresents the teacher LLM with parameters θT.\n",
      "Given the datasets D(kd)\n",
      "Ibuilt for distillation, we then define\n",
      "a learning objective as\n",
      "L=X\n",
      "ILI(D(kd)\n",
      "I;θS), (2)\n",
      "whereP\n",
      "Idenotes there could be multiple tasks or skills\n",
      "being distilled into one student model, LI(·;·)stands for a\n",
      "specific learning objective, ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " LLMs (Eq.1), and ‘Distillation,’\n",
      "centered on injecting this knowledge into student models\n",
      "(Eq.2). We will elaborate on these two processes in the\n",
      "subsequent sections.\n",
      "3.1 Knowledge\n",
      "This section focuses on the approaches to elicit knowledge\n",
      "from teacher LLMs. According to the manners to acquire\n",
      "knowledge, we divided them into Labeling ,Expansion ,DataCuration ,Feature ,Feedback , and Self-Knowledge . Figure 5\n",
      "shows an illustration of these knowledge elicitation meth-\n",
      "ods.\n",
      "3.1.1 Labeling\n",
      "Labeling...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "lable through the\n",
      "predefined Iandc. This process can be formulated as\n",
      "follows:\n",
      "D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}. (3)\n",
      "Input xcould be sourced from existing NLP task\n",
      "datasets, which serve as typical reservoirs for distillation\n",
      "efforts. Numerous works have sought to harness the capa-\n",
      "bilities of powerful LLMs as teachers for annotating dataset\n",
      "samples across a range of tasks. For instance, efforts in\n",
      "natural language understanding involve using LLMs to cat-\n",
      "egorize text (Gilardi et al., 2023; Ding...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "t al., 2023d;\n",
      "Liu et al., 2023g), among others. Rather than concentrating\n",
      "on specific tasks, many current works focus on labeling\n",
      "outputs based on instructions, thereby teaching student\n",
      "models to solve tasks in a more flexible way by following in-\n",
      "structions. Collections of various NLP tasks, complemented\n",
      "by instructional templates, serve as valuable input sources\n",
      "forx. For instance, FLAN-v2 collections (Longpre et al.,\n",
      "2023) offers extensive publicly available sets of tasks with\n",
      "instructions, w...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "from forums like Quora and Stack Overflow.\n",
      "Moreover, the process of labeling could be guided by\n",
      "instructions Ior demonstrations c. A commonly used in-\n",
      "struction type for guiding labeling is chain-of-thought (CoT)\n",
      "prompt (Hsieh et al., 2023; Fu et al., 2023; Magister et al.,\n",
      "2023). Mukherjee et al. (2023) add multiple system messages\n",
      "(e.g. “You must generate a detailed and long answer.” or\n",
      "“explain like I’m five, think step-by-step”) to elicit rich\n",
      "signals. Yue et al. (2023a) and Chenglin et al. ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "acher LLMs. Labeling : The teacher generates\n",
      "the output from the input; Expansion : The teacher generates samples similar to the given demonstrations through in-\n",
      "context learning; Data Curation : The teacher synthesizes data according to meta-information, such as a topic or an entity;\n",
      "Feature : Feed the data into the teacher and extract its internal knowledge, such as logits and features; Feedback : The teacher\n",
      "provides feedback on the student’s generations, such as preferences, corrections, exp...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " is constrained by the scale\n",
      "and variety of the input data. In real-world applications,\n",
      "especially those involving user conversations, there are also\n",
      "concerns regarding the privacy of the data involved. To\n",
      "address these limitations, various expansion methods have\n",
      "been proposed (Wang et al., 2022a; Taori et al., 2023; Chaud-\n",
      "hary, 2023; Si et al., 2023; Ji et al., 2023a; Luo et al., 2023b,a;\n",
      "Wu et al., 2023c; Sun et al., 2024b; Xu et al., 2023a; Guo\n",
      "et al., 2023c; Rozi `ere et al., 2023; West et ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "as follows:\n",
      "D(exp)={(x, y)|x∼pT(x|I⊕c), y∼pT(y|I⊕x)}.(4)\n",
      "In this formulation, xand yrepresent the new input-\n",
      "output pairs generated by the teacher LLM. The input x\n",
      "is generated based on a set of input-output demonstrations\n",
      "c. The output yis then generated in response to the new\n",
      "input xunder the guidance of an instruction I. Note thatthe demonstrations could be predefined or dynamically\n",
      "updated by adding the newly generated samples.\n",
      "Expansion techniques have been widely utilized to\n",
      "extract extens...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " text-\n",
      "davinci-003, to distill 52K high-quality data. To improve\n",
      "the diversity and coverage during expansion, Wu et al.\n",
      "(2023c) and (Sun et al., 2024b) prompt the teacher LLM to\n",
      "generate instructions corresponding to some specific topics.\n",
      "Xu et al. (2023a) propose an Evol-Instruct method to ex-\n",
      "pand the instructions from two dimensions: difficulty (e.g.\n",
      "rewriting the question to be more complex) and diversity\n",
      "(e.g. generating more long-tailed instructions). This Evol-\n",
      "Instruct method is domain-a...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "023b) proposes the Targeted Data Generation (TDG)\n",
      "framework, which automatically identifies challenging sub-\n",
      "groups within data and generates new samples for these\n",
      "subgroups using LLMs through in-context learning.\n",
      "In summary, the expansion method leverages the in-\n",
      "9\n",
      "context learning strengths of LLMs to produce more var-\n",
      "ied and extensive datasets with both inputs and outputs.\n",
      "However, the quality and diversity of the generated data\n",
      "are heavily reliant on the teacher LLMs and the initial seed\n",
      "de...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "emergence\n",
      "of the Data Curation approach. This method arises in re-\n",
      "sponse to the limitations observed in both the Labeling and\n",
      "Expansion approaches. These methods often yield data of\n",
      "variable quality and face constraints in quantity. In Labeling,\n",
      "the seed knowledge is sourced from task datasets, leading\n",
      "to potential noise and dirty data. Meanwhile, in Expansion,\n",
      "the input xis derived from seed demonstrations, which\n",
      "can result in homogeneous data when generated in large\n",
      "quantities. To overcome th...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "to this process to generate controllable x\n",
      "andy. Thus, this process can be meticulously controlled\n",
      "to yield datasets that are not only large in scale but also\n",
      "of high quality. The formulation for Data Curation can be\n",
      "represented as:\n",
      "D(cur)={(x, y)|x∼pT(x|I⊕m), y∼pT(y|I⊕x)}.(5)\n",
      "In this formulation, mrepresents the diverse meta-\n",
      "information used to guide the synthesis of x, and Iis the\n",
      "instruction guiding teacher LLMs to generate xory.\n",
      "Different studies primarily vary in their source and\n",
      "method of...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " broad array\n",
      "of instructions and conversations, achieving a substantial\n",
      "scale of 1.5 million instances. UltraChat stands out with its\n",
      "lexical and topical diversity. The UltraLLaMA model, fine-\n",
      "tuned on this data, consistently surpasses other open-source\n",
      "models. Another notable series, phi(Gunasekar et al., 2023;\n",
      "Li et al., 2023a; Mar, 2023), focuses on distilling smaller,\n",
      "high-quality datasets akin to ”textbooks.” Phi-1 (Gunasekar\n",
      "et al., 2023) experiments with synthesizing ”textbook qual-\n",
      "ity” ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "coding benchmarks like Hu-\n",
      "manEval and MBPP while being 10 times smaller in model\n",
      "size and 100 times smaller in dataset size. MFTCoder (Liu\n",
      "et al., 2023d) utilizes hundreds of Python knowledge points\n",
      "as meta-information to create a CodeExercise Dataset. In\n",
      "contrast, Magicoder (Wei et al., 2023) and WaveCoder (Yu\n",
      "et al., 2024) get raw code collections from open-source\n",
      "code datasets, using this as meta-information for generating\n",
      "instructional data. In the context of NLU tasks, certain\n",
      "studies (Ye ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ts\n",
      "that are not only high-quality and diverse but also large\n",
      "in scale. The success of models like phi-1 in specialized\n",
      "domains underscores the efficacy of this method. The ability\n",
      "to create synthetic datasets will become a crucial technical\n",
      "skill and a key area of focus in AI (Li et al., 2023a).\n",
      "3.1.4 Feature\n",
      "The previously discussed knowledge elicitation methods\n",
      "are typically applied to powerful black-box models, which\n",
      "are expensive and somewhat unreproducible due to calling\n",
      "API. In contrast, w...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "e context of\n",
      "generative LLMs (Timiryasov and Tastet, 2023; Liang et al.,\n",
      "2023a; Gu et al., 2024; Agarwal et al., 2024; Liu et al., 2023a;\n",
      "Wen et al., 2023; Wan et al., 2024a; Zhao and Zhu, 2023; Qin\n",
      "et al., 2023b; Boizard et al., 2024; Zhong et al., 2024).\n",
      "The typical method for acquiring this feature knowledge\n",
      "involves teacher LLMs annotating the output sequence y\n",
      "with its internal representations. These annotations are then\n",
      "distilled into the student model using methods such as\n",
      "Kullback-Leible...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "d dataset of sequences with\n",
      "token-level probability distributions (Sanh et al., 2019; Wen\n",
      "et al., 2023). To leverage the rich semantic and syntactic\n",
      "knowledge in intermediate layers of the teacher model,\n",
      "TED (Liang et al., 2023a) designs task-aware layer-wise\n",
      "distillation. They align the student’s hidden representations\n",
      "with those of the teacher at each layer, selectively extracting\n",
      "knowledge pertinent to the target task. Gu et al. (2024) and\n",
      "Agarwal et al. (2024) introduce a novel approach wher...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "rve the original output\n",
      "distribution when quantizing the LLMs, ensuring minimal\n",
      "loss of performance. Additionally, feature knowledge could\n",
      "serve as a potent source for multi-teacher knowledge distil-\n",
      "lation. Timiryasov and Tastet (2023) leverages an ensemble\n",
      "of GPT-2 and LLaMA as teacher models to extract output\n",
      "distributions. Similarly, FuseLLM (Wan et al., 2024a) inno-\n",
      "vatively combines the capabilities of various LLMs through\n",
      "a weighted fusion of their output distributions, integrating\n",
      "them i...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ma-\n",
      "tion. While showing promise, especially in smaller models,\n",
      "its application is not suitable for black-box LLMs where\n",
      "internal parameters are inaccessible. Furthermore, student\n",
      "models distilled from white-box LLMs may underperform\n",
      "compared to their black-box counterparts, as the black-box\n",
      "teacher LLMs (e.g. GPT-4) tend to be more powerful.\n",
      "3.1.5 Feedback\n",
      "Most previous works predominantly focus on one-way\n",
      "knowledge transfer from the teacher to the student for\n",
      "imitation, without considering feed...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "(x, y, ϕ fb(x, y;θT))|x∼ X, y∼pS(y|x)}, (7)\n",
      "where ydenotes the output generated by the student\n",
      "model in response to x, and ϕfb(·;θT))represents providing\n",
      "feedback from teacher LLMs. This operation evaluates thestudent’s output ygiven the input x, by offering assess-\n",
      "ment, corrective information, or other forms of guidance.\n",
      "This feedback knowledge can not only be distilled into\n",
      "the student to also generate feedback (such as creating a\n",
      "student preference model) but, more importantly, enable\n",
      "the st...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "preferences could be distilled from teachers\n",
      "by prompting it with specific criteria. Bai et al. (2022a) in-\n",
      "troduce RLAIF for distilling harmlessness preferences from\n",
      "LLMs. This involves using an SFT-trained LLM to generate\n",
      "response pairs for each prompt, then ranking them for\n",
      "harmlessness to create a preference dataset. This dataset is\n",
      "distilled into a Preference Model (PM), which then guides\n",
      "the RL training of a more harmless LLM policy. Wizard-\n",
      "Math (Luo et al., 2023b) places emphasis on math...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "-following, truthfulness, honesty and\n",
      "helpfulness.\n",
      "Beyond merely assessing student generations, teachers\n",
      "can also furnish extensive feedback on instances where\n",
      "students underperform. In Lion (Jiang et al., 2023b), teacher\n",
      "model pinpoints instructions that pose challenges to the\n",
      "student model, generating new, more difficult instructions\n",
      "aimed at bolstering the student’s abilities. PERsD (Chen\n",
      "et al., 2023a) showcases a method where teacher offers\n",
      "tailored refinement feedback on incorrect code sni...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ent an innovative strategy\n",
      "wherein the student model initially generates sequences,\n",
      "followed by teacher model producing an output distribution\n",
      "as feedback. This method leverages the teacher’s insight\n",
      "to directly inform and refine the student model’s learning\n",
      "process.\n",
      "3.1.6 Self-Knowledge\n",
      "The knowledge could also be elicited from the student itself,\n",
      "which we refer to as Self-Knowledge . In this setting, the same\n",
      "model acts both as the teacher and the student, iteratively\n",
      "improving itself by disti...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "elf-generated outputs y, which\n",
      "could include but is not limited to filtering, rewarding, or\n",
      "any other mechanisms for enhancing or evaluating y. It\n",
      "could be governed by external tools or the student itself θS.\n",
      "Recent research in this area has proposed various innovative\n",
      "methodologies to elicit self-knowledge, demonstrating its\n",
      "potential for creating more efficient and autonomous learn-\n",
      "ing systems. (Allen-Zhu and Li, 2020; Wang et al., 2022a;\n",
      "Sun et al., 2024b; Yang et al., 2024a; Jung et al., 20...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ne-tunes the original\n",
      "model. Other methods aim to elicit targeted knowledge\n",
      "from student models by modifying prompts, and leveraging\n",
      "these data for further refinement. In Self-Align (Sun et al.,\n",
      "2024b), they find that models fine-tuned by Self-Instruct\n",
      "data tend to generate short or indirect responses. They\n",
      "prompt this model with verbose instruction to produce in-\n",
      "depth and detailed responses. Then, they employ context-\n",
      "distillation (Askell et al., 2021) to distill these responses\n",
      "paired with no...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "\n",
      "tence summarization tasks, implementing filters based on\n",
      "entailment, length, and diversity to screen self-generated\n",
      "summaries. LMSI (Huang et al., 2023a) generates multiple\n",
      "CoT reasoning paths and answers for each question, and\n",
      "then retains only those paths that lead to the most consistent\n",
      "answer.\n",
      "Note that refined self-knowledge can be iteratively ac-\n",
      "quired as the student model continuously improves, further\n",
      "enhancing the student’s capabilities. This is Gulcehre et al.\n",
      "(2023) introduces a Rei...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      ",\n",
      "2024a) introduces a framework resembling iterative DPO,\n",
      "where the language model is fine-tuned to differentiate the\n",
      "self-generated responses from the human-annotated data.\n",
      "These self-generated responses could be seen as “negative\n",
      "knowledge” to promote the student to better align with\n",
      "the target distribution. Self-Rewarding (Yuan et al., 2024a)\n",
      "explores a novel and promising approach by utilizing the\n",
      "language model itself as a reward model. It employs LLM-\n",
      "as-a-Judge prompting to autonomously a...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "g and Rank Optimization ,\n",
      "as shown in Figure 3.\n",
      "3.2.1 Supervised Fine-Tuning\n",
      "Supervised Fine-Tuning (SFT), or called Sequence-Level KD\n",
      "(SeqKD) (Kim and Rush, 2016), is the simplest and one of\n",
      "the most effective methods for distilling powerful black-box\n",
      "LLMs. SFT finetunes student model by maximizing the like-\n",
      "lihood of sequences generated by the teacher LLMs, aligning\n",
      "the student’s predictions with those of the teacher. This\n",
      "process can be mathematically formulated as minimizing\n",
      "the objective fu...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      ", 2022a;\n",
      "Huang et al., 2023c; Xu et al., 2023b; Zelikman et al., 2022).\n",
      "Due to the large number of KD works applying SFT, we\n",
      "only list representative ones here. More detailed works can\n",
      "be found in §4.\n",
      "3.2.2 Divergence and Similarity\n",
      "This section mainly concentrates on algorithms designed for\n",
      "distilling feature knowledge from white-box teacher LLMs,\n",
      "including distributions and hidden state features. These\n",
      "algorithms can be broadly categorized into two groups:\n",
      "those minimizing divergence in probab...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "(x, y))∥2\n",
      "L1-Norm Distance ∥ΦT(fT(x, y))−ΦS(fS(x, y))∥1\n",
      "Cross-Entropy Loss −PΦT(fT(x, y)) log(Φ S(fS(x, y)))\n",
      "Maximum Mean Discrepancy MMD (ΦT(fT(x, y)),ΦS(fS(x, y)))\n",
      "TABLE 2: Summary of similarity functions in knowledge\n",
      "distillation.\n",
      "and student models, represented by a general divergence\n",
      "function D:\n",
      "LDiv= E\n",
      "x∼X,y∼Y[D(pT(y|x), pS(y|x))], (10)\n",
      "The specific form of Dvaries depending on the type of\n",
      "divergence employed. Table 1 outlines the functional forms\n",
      "ofDfor different divergence measures. The ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ss to tokens with low probability\n",
      "under the teacher’s distribution (cf. Figure 6 blue curve).\n",
      "This mode-covering phenomenon can potentially lead to\n",
      "hallucinations and low-quality generations. Alternatively,\n",
      "mode-seeking divergences like reverse KL prioritize tokens\n",
      "where the teacher assigns high probabilities (cf. Figure 6\n",
      "green curve). This approach can mitigate the risk of low-\n",
      "quality outputs, fostering more accurate generations. How-\n",
      "ever, it often does so at the cost of reduced diversity. G...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "r variations, while reverse KL\n",
      "divergence is preferable for tasks like dialogue generation\n",
      "and instruction tuning, which involve multiple modes and\n",
      "a wider range of potential responses. Thus, the nature of the\n",
      "task significantly influences the selection of the divergence\n",
      "function for optimal performance.\n",
      "Similarity. Similarity-based methods in knowledge distilla-\n",
      "tion aim to align the hidden states or features of the student\n",
      "pargminqKL(p||q)argminqKL(q||p)Fig. 6: Comparison of Forward and Revers...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "tive is to ensure that the student model not only\n",
      "produces similar outputs to the teacher but also processes\n",
      "information in a comparable manner. The formulation for a\n",
      "similarity-based objective might look like this:\n",
      "LSim= E\n",
      "x∼X,y∼Y[LF(ΦT(fT(x, y)),ΦS(fS(x, y)))],(11)\n",
      "where fT(x, y)andfS(x, y)are the feature maps of the\n",
      "teacher and student models, respectively. The transforma-\n",
      "tion functions ΦTandΦSare applied to these feature maps\n",
      "to ensure they are in the same shape, facilitating direct\n",
      "compari...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "iltered\n",
      "representations in both teacher and student models. While\n",
      "similarity-based approaches are common in encoder-based\n",
      "LMs (Sun et al., 2019, 2020; Jiao et al., 2020; Hou et al.,\n",
      "2020; Zuo et al., 2022; Liang et al., 2021), their application in\n",
      "LLM knowledge distillation is not as widespread. However,\n",
      "considering their effectiveness, we anticipate an increase in\n",
      "research exploring these methods for LLM distillation in the\n",
      "near future.\n",
      "3.2.3 Reinforcement Learning\n",
      "This section explores advance...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "model rϕusing the feedback data D(fd)\n",
      "generated by teacher LLMs. Preference data, as one of the\n",
      "typical feedback, is employed to train the student reward\n",
      "model (Bai et al., 2022a; Cui et al., 2023a; Lee et al., 2023a;\n",
      "Kim et al., 2023a). They usually consist of input-output\n",
      "pairs (x, yw, yl). Here, ywandylrepresent “winning” and\n",
      "“losing” outputs relative to the teacher’s preferences. The\n",
      "loss function for the reward model is defined as:\n",
      "LRM(rϕ,D(fd)) =− E\n",
      "(x,yw,yl)∼D(fd)[logσ(rϕ(x, yw)−rϕ(x, yl)...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "second stage,\n",
      "the student model, represented by a policy πθ, is optimized\n",
      "to maximize the expected reward as per the trained reward\n",
      "model. Simultaneously, it minimizes the divergence from\n",
      "a reference policy πref, typically the initial policy of the\n",
      "student model trained by SFT, controlled by a factor β. The\n",
      "RL objective is given by:\n",
      "max\n",
      "πθE\n",
      "x∼X,y∼πθ(y|x)[rϕ(x, y)]−βDKL[πθ(y|x)∥πref(y|x)]\n",
      "(13)\n",
      "This RL framework not only ensures that the student model\n",
      "learns the explicit content from the teacher b...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "putational cost compared to employing a\n",
      "smaller distilled reward model.\n",
      "3.2.4 Ranking Optimization\n",
      "Ranking optimization presents a stable and computationally\n",
      "efficient alternative to RL for injecting preference feedback\n",
      "into language models (Rafailov et al., 2023; Song et al.,\n",
      "2023a; Yuan et al., 2023b). This method, diverging from\n",
      "traditional RL approaches, directly incorporates ranking\n",
      "information into language models from a fixed preference\n",
      "dataset during fine-tuning. Intuitively, it directly...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "v et al., 2023) to distill the\n",
      "preference alignment in teacher LLMs. DPO streamlines\n",
      "the objective of reinforcement learning (as in Eq. 13),\n",
      "which involves reward maximization with a KL-divergence\n",
      "constraint, into a single-stage policy training. Specifically,\n",
      "DPO’s training goal is to maximize the following expecta-\n",
      "tion:\n",
      "E\n",
      "(x,yw,yl)∼D(fd)\u0014\n",
      "logσ\u0012\n",
      "βlogπθ(yw|x)\n",
      "πref(yw|x)−βlogπθ(yl|x)\n",
      "πref(yl|x)\u0013\u0015\n",
      ",\n",
      "(14)\n",
      "where ywis preferred over ylaccording to the teacher\n",
      "LLM. Hong et al. (2023) (Hong et al., 202...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n",
      "INPUT TEXT:\n",
      " approach emphasizes direct comparison\n",
      "and ranking of responses based on the teacher’s preferences.\n",
      "PRO (Song et al., 2023a) expands the concept of pairwise\n",
      "comparison to handle preference rankings of any length. For\n",
      "a given instruction xand a sequence of responses ordered by\n",
      "teacher preference as y1≻y2≻...≻yn, the RPO training\n",
      "objective is:\n",
      "LPRO=−n−1X\n",
      "k=1logexp (pk)Pn\n",
      "i=kexp (pi), (16)\n",
      "where pkrepresents the conditional log probabilities for\n",
      "ykunder the student policy πθ. By iteratively contras...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "uding\n",
      "Context Following ,Alignment ,Agent ,NLP Task Specializa-\n",
      "tion and Multi-Modality .Context Following focuses on the\n",
      "student’s ability to comprehend and respond effectively\n",
      "to input information. Alignment delves into the student’s\n",
      "capability to align its output with the teacher’s responses.\n",
      "Moving forward, Agent underscores the autonomous nature\n",
      "of language models. NLP Task Specialization highlights the\n",
      "LLM’s versatility in specializing across various Natural\n",
      "Language Processing tasks, demo...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "A Expansion SFT\n",
      "Lion (Jiang et al., 2023b) IF Alpaca Cata ChatGPT LLaMA Labeling + Expansion + Feedback -\n",
      "BabyLlama (Timiryasov and Tastet, 2023) IF 10M-word BabyLM dataset GPT-2 + small LLaMA 58M-parameter LLaMA Feature D&S\n",
      "MiniLLM (Gu et al., 2024) IF Dolly Dataset GPT2 + OPT + LLaMA GPT2 + OPT + LLaMA Feature D&S\n",
      "Self-Align (Sun et al., 2024b) IF Human-written Principles LLaMA LLaMA Expansion + Self-Knowledge SFT\n",
      "Self-Rewarding (Yuan et al., 2024a) IF Human-written Samples LLaMA LLaMA Self-Kn...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " Human Conversation ChatGPT LLaMA Labeling SFT\n",
      "Baize (Xu et al., 2023b) IF/MD Quora + Stack Overflow ChatGPT LLaMA Expansion + Self-Knowledge SFT\n",
      "UltraChat (Ding et al., 2023b) IF/MD Wikidata + Text Material + C4 ChatGPT LLaMA Curation SFT\n",
      "Orca (Mukherjee et al., 2023) IF/TP FLAN-v2 ChatGPT + GPT4 LLaMA Labeling SFT\n",
      "Orca2 (Mitra et al., 2023) IF/TP FLAN-v2 + Few-Shot/Math/Synthetic GPT4 LLaMA Labeling SFT\n",
      "SelFee (Ye et al., 2023) IF/TP Human Conv, Flan/Code/Math Collection ChatGPT LLaMA Labeling...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "MA Label SFT\n",
      "KARD (Kang et al., 2023b) IF/RAG MedQAUSMLE ChatGPT T5 + OPT Label SFT + D&S\n",
      "Self-RAG (Asai et al., 2023) IF/RAG Open-Instruct GPT4 LLaMA Labeling SFT\n",
      "Alignment\n",
      "OpenChat (Wang et al., 2023c) IF/Preference Human Conversation ChatGPT + GPT4 LLaMA Labeling SFT + RL\n",
      "Zephyr (Tunstall et al., 2023) IF/Preference Mixed Datasets GPT4 Mistral Labeling + Feedback SFT + RO\n",
      "ALMoST (Kim et al., 2023a) IF/Preference Human-written Prompts LLaMA LLaMA Expansion + Labeling SFT + RL\n",
      "RLCD (Yang et al....\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "itten Prompts Self-defined Student Model Self-defined Model Labeling + Expansion + Feedback SFT + RL\n",
      "SANDBOX (Liu et al., 2023b) Value Simulationtext-davinci-002/-003 +\n",
      "GPT4 + ChatGPTLLaMA Data Curation SFT + RL\n",
      "Agent\n",
      "Toolformer (Schick et al., 2023) Tool CCNet GPT-J GPT-J Labeling SFT\n",
      "Graph-ToolFormer (Zhang, 2023) Tool Mixed Graph Dataset ChatGPT GPT-J + LLaMA Labeling SFT\n",
      "Gorilla (Patil et al., 2023) Tool Online API Documentation GPT4 LLaMA Expansion SFT\n",
      "GPT4Tools (Yang et al., 2023b) Tool Im...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "3a) Planning Mixed Interactive Tasks GPT4 LLaMA Labeling SFT\n",
      "AUTOACT (Qiao et al., 2024) Planning Mixed QA Tasks LLaMA LLaMA Labeling SFT\n",
      "NLP Task Specialization\n",
      "AugGPT (Dai et al., 2023a) NLU Amazon/Symptoms/PubMed20k Dataset ChatGPT BERT Label SFT\n",
      "TDG (He et al., 2023b) NLU SST + QQP + MNLI GPT3 BERT Expansion SFT\n",
      "SunGen (Gao et al., 2023a) NLU Text Classification Tasks GPT2 DistilBERT Curation SFT\n",
      "UDG (Wang et al., 2021a) NLU NLU Tasks GPT3 BERT Expansion SFT\n",
      "InheritSumm (Xu et al., 2023c) NL...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "tGPT LLaMA Labeling SFT\n",
      "RankZephyr (Pradeep et al., 2023b) IR IR Datasets ChatGPT + GPT4 Mistral Labeling SFT\n",
      "NDR (Mysore et al., 2023) Recommendation Recommendation Datasets GPT3 MPnet-110M Labeling SFT\n",
      "InstrcutRec (Zhang et al., 2023b) Recommendation 39 instruction templates ChatGPT Flan-T5 Expansion + Self-Knowledge SFT\n",
      "ONCE (Liu et al., 2023c) Recommendation Recommendation Dataset ChatGPT LLaMA Labeling SFT\n",
      "PandaLM (Wang et al., 2023b) Evaluation Alpaca Data ChatGPT LLaMA Labeling SFT\n",
      "Promet...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "tarCoder Expansion SFT\n",
      "Magicoder (Wei et al., 2023) Code Existing Source Codes ChatGPT LLaMa Curation SFT\n",
      "WaveCoder (Yu et al., 2024) Code Existing Source Codes GPT4 LLaMa Curation SFT\n",
      "Code Alpaca (Chaudhary, 2023) Code Code Instructions ChatGPT LLaMA Expansion + Self-Knowledge SFT\n",
      "Code Llama (Rozi `ere et al., 2023) Code Human-written Instructions LLaMA LLaMA Expansion + Self-Knowledge SFT\n",
      "Code Clean (Jain et al., 2023) Code Code Datasets ChatGPT LLaMA Labeling SFT\n",
      "Multi-Modality\n",
      "LLaVA (Liu et ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "atBridge (Zhao et al., 2023d) Multiple Modalities Task-Specific/Multimodal-Chat Data GPT4 + ChatGPT LLaMA Labeling SFT\n",
      "TABLE 3: A summary of skill distillation works. IF: Instruction Following, MD: Multi-turn Dialoue, TP: Think Pattern,\n",
      "RAG: Retrieval-Augmented Generation, NLU: Natural Language Understanding, NLG: Natural Language Generation, IR:\n",
      "Information Retrieval, SFT: Supervised Fine-Tuning, D&S: Divergence and Similarity, RL: Reinforcement Learning, RO:\n",
      "Ranking Optimization.\n",
      "Finally, Mult...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n",
      "INPUT TEXT:\n",
      "t demonstrations, intricate instructions, dia-\n",
      "logue history, and retrieval-augmented information — into\n",
      "smaller models. Many research efforts in this domain aim\n",
      "to imbue smaller models with these sophisticated, context-\n",
      "15\n",
      "following capabilities. Our discussion here will dissect this\n",
      "facet of skill distillation, categorizing it based on different\n",
      "types of context and elaborating on how each is distilled\n",
      "and incorporated into smaller, efficient models.\n",
      "4.1.1 Instruction Following\n",
      "Instruction-fol...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "uctional\n",
      "formats with templates, such as prefacing machine transla-\n",
      "tion data with ”Translate this sentence to Spanish:” . However,\n",
      "these approaches have limitations. Manual data creation is\n",
      "labor-intensive, while template-based transformation lacks\n",
      "diversity in instructions and may not align well with natural\n",
      "human input. LLMs like GPT-4 offer an efficient alternative\n",
      "for creating diverse and controlled SFT data by their capabil-\n",
      "ities of in-context learning and instruction following. Most\n",
      "rele...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      ",\n",
      "ensuring a broad spectrum of general instructions. Addi-\n",
      "tionally, a filtering and post-processing stage is introduced\n",
      "to eliminate redundant or similar instructions. Notably,\n",
      "through training with this enriched dataset, GPT-3 acquires\n",
      "the ability to follow instructions, enabling it to perform\n",
      "comparably to InstructGPT in zero-shot instruction tasks\n",
      "and when provided with expert-written instructions for\n",
      "novel tasks. Based on the self-instruct method, Taori et al.\n",
      "(2023) train an Alpaca model u...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ctions (Xu et al., 2023a; Luo et al.,\n",
      "2023b,a; Guo et al., 2023c). According to Xu et al. (2023a), in-\n",
      "struction datasets derived from human-written seeds often\n",
      "exhibit low to moderate complexity. To enhance the com-\n",
      "plex instruction-following capabilities of smaller models,\n",
      "WizardLM (Xu et al., 2023a) introduces Evol-Instruct . This\n",
      "method gradually transforms instructions into more com-\n",
      "plex forms through a multi-step evolution process, focusing\n",
      "on both increasing difficulty levels and expandi...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "nstruction Fusion (Guo et al., 2023c)\n",
      "further uses teacher LLMs to increase the complexity by\n",
      "fusing two distinct evolved instructions. Furthermore, this\n",
      "concept of “evolving” instructions has been extended to\n",
      "distill specific skills such as coding (Luo et al., 2023a) and\n",
      "mathematics (Luo et al., 2023b).\n",
      "Human Instructions. In contrast to works that rely on gener-\n",
      "ating instructions from ChatGPT, which may lack diversity\n",
      "and have gaps with real human instructions, Vicuna (Chiang\n",
      "et al., 2023) an...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "t al., 2023).\n",
      "System Instructions. To encourage student models to learn\n",
      "the reasoning process, Orca and Orca 2 (Mukherjee et al.,\n",
      "2023; Mitra et al., 2023) enhance the prompt, response data\n",
      "pairs by introducing a system message (e.g., ”explain like\n",
      "I’m five, think step-by-step”) to encourage student mod-\n",
      "els to grasp the reasoning process. This system message\n",
      "prompts GPT-4 to provide explanation traces that eluci-\n",
      "date the teacher’s reasoning process. Orca 2 (Mitra et al.,\n",
      "2023) further trains t...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      " by various meta-\n",
      "information. The UltraLLaMA model, fine-tuned on this\n",
      "data, consistently surpasses other open-source models. The\n",
      "Phi series models (Gunasekar et al., 2023; Li et al., 2023a;\n",
      "Mar, 2023) prioritize data quality and employ synthetic\n",
      "methods to generate data of “textbook quality” to enhance\n",
      "the learning experience for smaller models. Notably, Phi\n",
      "exhibits the ability to follow instructions effectively even\n",
      "without specific instruction fine-tuning. What’s particularly\n",
      "remarkable is ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ent-\n",
      "16\n",
      "ing vanilla instructions with specialized Expert Identity\n",
      "descriptions. Reflection-Tuning (Li et al., 2023e) improves\n",
      "both the instruction and response sequentially by reflecting\n",
      "on specific criteria. DEITA (Liu et al., 2023h) proposes to\n",
      "enhance and score instructions in three directions includ-\n",
      "ing complexity, quality, and diversity to get high-quality\n",
      "distillation data. MUFFIN (Lou et al., 2023) proposes to\n",
      "scale the instruction according to the input by diversifying\n",
      "these tasks with ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ility, like diver-\n",
      "sity, complexity and explanation. However, student mod-\n",
      "els trained on instruction data expanded by ChatGPT of-\n",
      "ten mimic ChatGPT’s style without replicating its factual\n",
      "accuracy (Gudibande et al., 2023). Achieving a more ca-\n",
      "pable instruction-following capability requires a stronger\n",
      "teacher LLM (Gudibande et al., 2023) and access to di-\n",
      "verse, high-quality instruction data, such as the one used\n",
      "in Orca (Mukherjee et al., 2023; Mitra et al., 2023), which\n",
      "incorporates extensive...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "knowl-\n",
      "edge from teacher LLMs (Chiang et al., 2023; Xu et al., 2023b;\n",
      "Ding et al., 2023b; Li et al., 2023b; Wang et al., 2023c; Tunstall\n",
      "et al., 2023).\n",
      "ShareGPT serves as a platform for users to share their\n",
      "conversations with ChatGPT, offering a vast repository of\n",
      "multi-turn conversations readily available. Some small chat\n",
      "models are trained using this data to acquire the capability\n",
      "for engaging in multi-turn dialogues (Chiang et al., 2023; Ye\n",
      "et al., 2023; Wang et al., 2023c). For example, Vicu...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      ") enhance the quality of multi-turn\n",
      "data from ShareGPT by generating self-feedback on model\n",
      "responses and iteratively refining the responses based on\n",
      "the received feedback.\n",
      "3. MT-Bench: a multi-turn question set, where the generations of\n",
      "models are evaluated by LLM, like GPT-4.To enhance the multi-turn capabilities of student models,\n",
      "another line of research focuses on expanding conversa-\n",
      "tional datasets through self-chat and using them to train\n",
      "smaller models (Xu et al., 2023b; Ding et al., 202...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ogues from ChatGPT. Notably, UltraChat encom-\n",
      "passes a wide range of topics and instructions. Building\n",
      "upon the UltraChat dataset, they fine-tune a LLaMA model,\n",
      "resulting in the creation of a powerful chat model known as\n",
      "UltraLLaMA. UltraLLaMA consistently outperforms other\n",
      "open-source chat models, including Vicuna and Baize. Fur-\n",
      "thermore, UltraChat is employed in conjunction with an\n",
      "AI preference-aligned chat model named Zephyr (Tunstall\n",
      "et al., 2023). Zephyr enhances intent alignment through\n",
      "...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ave been proposed (Kang et al., 2023a; Luo\n",
      "et al., 2023c; Asai et al., 2023).\n",
      "SAIL (Luo et al., 2023c) starts by retrieving search results\n",
      "for each training case using search APIs, creating search-\n",
      "augmented instructions that include both the instruction\n",
      "and grounding information. To encourage the language\n",
      "model to prioritize informative retrieval results, they input\n",
      "each retrieved passage along with the ground truth response\n",
      "into the entailment model to label each retrieval result for\n",
      "relevance...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "ionales serve\n",
      "as a means to retrieve relevant knowledge d, and the student\n",
      "LM is subsequently fine-tuned using the rationales along-\n",
      "side questions and knowledge. However, during inference,\n",
      "only questions are available. To address this, the Reranker\n",
      "is trained to mimic how the retriever scores passages with\n",
      "the rationale by minimizing the KL divergence between\n",
      "Retriever (d|r)andReranker (d|x). However, the integra-\n",
      "tion of a fixed number of passages in language models,\n",
      "without considering their ...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "n Retrieve . To distill\n",
      "this critic data, GPT-4 is prompted to assess the need for\n",
      "retrieval using few-shot demonstrations I, the task input\n",
      "x, and output yto predict a reflection token ras follows:\n",
      "p(r|I, x, y ).\n",
      "4.2 Alignment\n",
      "4.2.1 Thinking Pattern\n",
      "Most existing methods mainly focus on directly aligning the\n",
      "direct responses of the student models to the responses of\n",
      "teacher models (Taori et al., 2023). Though effective, these\n",
      "models might suffer the problems that they tend to learn to\n",
      "imitate t...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INPUT TEXT:\n",
      "2022; Madaan et al., 2023; Saunders\n",
      "et al., 2022), SelFee (Ye et al., 2023) proposes to train a\n",
      "model that has been fine-tuned to continuously revise its\n",
      "own answer until it provides a high-quality response in a\n",
      "single inference. During training, it utilizes both the final\n",
      "response and feedback chain as the fitting target. This pat-\n",
      "tern, response with the revision process, shows a promising\n",
      "performance gain. Following SelFee, Reflection-Tuning (Li\n",
      "et al., 2023e, 2024d) also utilizes the reflect...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n",
      "INPUT TEXT:\n",
      "r presents to eq...\n",
      "\n",
      "PROCESSED TEXT:\n",
      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
      "1The University of Hong Kong2University of Maryland3Microsoft\n",
      "4University of Technology Sydney5Peking University6The University of Sydney\n",
      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
      "ckcheng@cs.hku.hk\n",
      "is meticulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of ...\n",
      "==========================================================================================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "with open(INPUT_FILE, 'r', encoding='utf-8') as file:\n",
    "    text = file.read()\n",
    "\n",
    "# Calculate number of chunks\n",
    "num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE\n",
    "\n",
    "# Cell 6: Process the file\n",
    "# Create output file name\n",
    "output_file = f\"clean_{os.path.basename(INPUT_FILE)}\"\n",
    "\n",
    "# Process chunks and write to file\n",
    "processed_text = \"\"\n",
    "\n",
    "with open(output_file, 'w', encoding='utf-8') as out_file:\n",
    "    for chunk_num in tqdm(range(num_chunks), desc=\"Processing chunks\"):\n",
    "        # Get chunk with overlap\n",
    "        start_idx = chunk_num * CHUNK_SIZE\n",
    "        end_idx = start_idx + CHUNK_SIZE\n",
    "        \n",
    "        chunk = text[start_idx:end_idx]\n",
    "        \n",
    "        # Process chunk and append to complete text\n",
    "        processed_chunk = process_chunk(chunk)\n",
    "        processed_text += processed_chunk + \" \"\n",
    "        \n",
    "        # Write chunk immediately to file\n",
    "        out_file.write(processed_chunk)\n",
    "        \n",
    "        # Force flush the file to disk\n",
    "        out_file.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0183c47-339d-4041-ae83-77fc34931075",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}