
Step-1 Notebook

Sanyam Bhutani committed 11 months ago
parent 2c5ad4d55b

+ 445 - 0
recipes/quickstart/NotebookLlama/Step-1 PDF Pre-Processing Logic.ipynb

@@ -0,0 +1,445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f68aee84-04e3-4cbc-be78-6de9e06e704f",
+   "metadata": {},
+   "source": [
+    "Notebook for uploading PDF, extracting all Text and Pre-Processing using a 1B or 3B model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f4fc7aef-3505-482e-a998-790b8b9d48e4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting PyPDF2\n",
+      "  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)\n",
+      "Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
+      "Installing collected packages: PyPDF2\n",
+      "Successfully installed PyPDF2-3.0.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install PyPDF2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "60d0061b-8b8c-4353-850f-f19466a0ae2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdf_path = './2402.13116v3.pdf'\n",
+    "DEFAULT_MODEL = \"meta-llama/Llama-3.2-1B-Instruct\"\n",
+    "#DEFAULT_MODEL = \"meta-llama/Llama-3.2-1B-Instruct\" <- Don't think this would be necessary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "21029232-ac5f-42ca-b26b-baad5b2f49b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import necessary libraries\n",
+    "import PyPDF2\n",
+    "from typing import Optional\n",
+    "import os\n",
+    "import torch\n",
+    "from accelerate import Accelerator\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "from tqdm.notebook import tqdm\n",
+    "import warnings\n",
+    "\n",
+    "accelerator = Accelerator()\n",
+    "device = accelerator.device\n",
+    "\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "153d9ece-37a4-4fff-a8e8-53f923a2b0a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def validate_pdf(file_path: str) -> bool:\n",
+    "    if not os.path.exists(file_path):\n",
+    "        print(f\"Error: File not found at path: {file_path}\")\n",
+    "        return False\n",
+    "    if not file_path.lower().endswith('.pdf'):\n",
+    "        print(\"Error: File is not a PDF\")\n",
+    "        return False\n",
+    "    return True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b57c2d64-3d75-4aeb-b4ee-bd1661286b66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:\n",
+    "    if not validate_pdf(file_path):\n",
+    "        return None\n",
+    "    \n",
+    "    try:\n",
+    "        with open(file_path, 'rb') as file:\n",
+    "            # Create PDF reader object\n",
+    "            pdf_reader = PyPDF2.PdfReader(file)\n",
+    "            \n",
+    "            # Get total number of pages\n",
+    "            num_pages = len(pdf_reader.pages)\n",
+    "            print(f\"Processing PDF with {num_pages} pages...\")\n",
+    "            \n",
+    "            extracted_text = []\n",
+    "            total_chars = 0\n",
+    "            \n",
+    "            # Iterate through all pages\n",
+    "            for page_num in range(num_pages):\n",
+    "                # Extract text from page\n",
+    "                page = pdf_reader.pages[page_num]\n",
+    "                text = page.extract_text()\n",
+    "                \n",
+    "                # Check if adding this page's text would exceed the limit\n",
+    "                if total_chars + len(text) > max_chars:\n",
+    "                    # Only add text up to the limit\n",
+    "                    remaining_chars = max_chars - total_chars\n",
+    "                    extracted_text.append(text[:remaining_chars])\n",
+    "                    print(f\"Reached {max_chars} character limit at page {page_num + 1}\")\n",
+    "                    break\n",
+    "                \n",
+    "                extracted_text.append(text)\n",
+    "                total_chars += len(text)\n",
+    "                print(f\"Processed page {page_num + 1}/{num_pages}\")\n",
+    "            \n",
+    "            final_text = '\\n'.join(extracted_text)\n",
+    "            print(f\"\\nExtraction complete! Total characters: {len(final_text)}\")\n",
+    "            return final_text\n",
+    "            \n",
+    "    except PyPDF2.PdfReadError:\n",
+    "        print(\"Error: Invalid or corrupted PDF file\")\n",
+    "        return None\n",
+    "    except Exception as e:\n",
+    "        print(f\"An unexpected error occurred: {str(e)}\")\n",
+    "        return None\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "0984bb1e-d52c-4cec-a131-67a48061fabc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get PDF metadata\n",
+    "def get_pdf_metadata(file_path: str) -> Optional[dict]:\n",
+    "    if not validate_pdf(file_path):\n",
+    "        return None\n",
+    "    \n",
+    "    try:\n",
+    "        with open(file_path, 'rb') as file:\n",
+    "            pdf_reader = PyPDF2.PdfReader(file)\n",
+    "            metadata = {\n",
+    "                'num_pages': len(pdf_reader.pages),\n",
+    "                'metadata': pdf_reader.metadata\n",
+    "            }\n",
+    "            return metadata\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error extracting metadata: {str(e)}\")\n",
+    "        return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "63848943-79cc-4e21-8396-6eab5df493e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Extracting metadata...\n",
+      "\n",
+      "PDF Metadata:\n",
+      "Number of pages: 44\n",
+      "Document info:\n",
+      "/Author: \n",
+      "/CreationDate: D:20240311015030Z\n",
+      "/Creator: LaTeX with hyperref\n",
+      "/Keywords: \n",
+      "/ModDate: D:20240311015030Z\n",
+      "/PTEX.Fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5\n",
+      "/Producer: pdfTeX-1.40.25\n",
+      "/Subject: \n",
+      "/Title: \n",
+      "/Trapped: /False\n",
+      "\n",
+      "Extracting text...\n",
+      "Processing PDF with 44 pages...\n",
+      "Processed page 1/44\n",
+      "Processed page 2/44\n",
+      "Processed page 3/44\n",
+      "Processed page 4/44\n",
+      "Processed page 5/44\n",
+      "Processed page 6/44\n",
+      "Processed page 7/44\n",
+      "Processed page 8/44\n",
+      "Processed page 9/44\n",
+      "Processed page 10/44\n",
+      "Processed page 11/44\n",
+      "Processed page 12/44\n",
+      "Processed page 13/44\n",
+      "Processed page 14/44\n",
+      "Processed page 15/44\n",
+      "Processed page 16/44\n",
+      "Reached 100000 character limit at page 17\n",
+      "\n",
+      "Extraction complete! Total characters: 100016\n",
+      "\n",
+      "Preview of extracted text (first 500 characters):\n",
+      "--------------------------------------------------\n",
+      "1\n",
+      "A Survey on Knowledge Distillation of Large\n",
+      "Language Models\n",
+      "Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
+      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
+      "1The University of Hong Kong2University of Maryland3Microsoft\n",
+      "4University of Technology Sydney5Peking University6The University of Sydney\n",
+      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
+      "ckcheng@cs.hku.hk jl0725@connect.hku.hk\n",
+      "Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati\n",
+      "--------------------------------------------------\n",
+      "\n",
+      "Total characters extracted: 100016\n",
+      "\n",
+      "Extracted text has been saved to extracted_text.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Extract metadata first\n",
+    "print(\"Extracting metadata...\")\n",
+    "metadata = get_pdf_metadata(pdf_path)\n",
+    "if metadata:\n",
+    "    print(\"\\nPDF Metadata:\")\n",
+    "    print(f\"Number of pages: {metadata['num_pages']}\")\n",
+    "    print(\"Document info:\")\n",
+    "    for key, value in metadata['metadata'].items():\n",
+    "        print(f\"{key}: {value}\")\n",
+    "\n",
+    "# Extract text\n",
+    "print(\"\\nExtracting text...\")\n",
+    "extracted_text = extract_text_from_pdf(pdf_path)\n",
+    "\n",
+    "# Display first 500 characters of extracted text as preview\n",
+    "if extracted_text:\n",
+    "    print(\"\\nPreview of extracted text (first 500 characters):\")\n",
+    "    print(\"-\" * 50)\n",
+    "    print(extracted_text[:500])\n",
+    "    print(\"-\" * 50)\n",
+    "    print(f\"\\nTotal characters extracted: {len(extracted_text)}\")\n",
+    "\n",
+    "# Optional: Save the extracted text to a file\n",
+    "if extracted_text:\n",
+    "    output_file = 'extracted_text.txt'\n",
+    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
+    "        f.write(extracted_text)\n",
+    "    print(f\"\\nExtracted text has been saved to {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "7c0828a5-964d-475e-b5f5-40a04e287725",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "\n",
+    "SYS_PROMPT = \"\"\"\n",
+    "You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.\n",
+    "\n",
+    "The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.\n",
+    "\n",
+    "Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive\n",
+    "\n",
+    "The goal is to use this in a podcast research transcript so a lot of the emails, citations, and things like that can be removed-please be smart with what you remove and be creative ok?\n",
+    "\n",
+    "Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RETURNING AS IS\n",
+    "\n",
+    "Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.\n",
+    "\n",
+    "ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?\n",
+    "Here is the text:\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "d04a4f07-b0b3-45ca-8f41-a433e1abe050",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "accelerator = Accelerator()\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    DEFAULT_MODEL,\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    "    use_safetensors=True,\n",
+    "    device_map=device,\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)\n",
+    "model, tokenizer = accelerator.prepare(model, tokenizer)"
+   ]
+  },
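+  {
+   "cell_type": "markdown",
+   "id": "added-footprint-note",
+   "metadata": {},
+   "source": [
+    "Optional sanity check (an added sketch, not required by the pipeline): confirm the model loaded and estimate its memory footprint. In bfloat16 each parameter takes 2 bytes, so the 1B model should come in around 2.5 GB."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "added-footprint-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rough footprint estimate: parameter count x 2 bytes (bfloat16)\n",
+    "n_params = sum(p.numel() for p in model.parameters())\n",
+    "print(f\"{n_params / 1e9:.2f}B parameters, ~{n_params * 2 / 1e9:.1f} GB in bfloat16, on {model.device}\")"
+   ]
+  },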
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "bbda5241-e890-4402-87dd-514d6761bb9c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_chunk(text_chunk):\n",
+    "    conversation = [\n",
+    "        {\"role\": \"system\", \"content\": SYS_PROMPT},\n",
+    "        {\"role\": \"user\", \"content\": text_chunk},\n",
+    "    ]\n",
+    "    \n",
+    "    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n",
+    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
+    "    \n",
+    "    with torch.no_grad():  # Add this for efficiency\n",
+    "        output = model.generate(\n",
+    "            **inputs,\n",
+    "            temperature=0.7,\n",
+    "            top_p=0.9,\n",
+    "            max_new_tokens=8126\n",
+    "        )\n",
+    "    \n",
+    "    return tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()"
+   ]
+  },
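+  {
+   "cell_type": "markdown",
+   "id": "added-smoke-test-note",
+   "metadata": {},
+   "source": [
+    "Before the full run, it can help to smoke-test `process_chunk` on a tiny messy snippet (an added sketch; the sample mirrors the extraction preview above) and confirm the model returns cleaned text rather than an acknowledgement or a summary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "added-smoke-test",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tiny snippet taken from the extraction preview: stray newlines, emails, affiliations\n",
+    "sample = \"1\\nA Survey on Knowledge Distillation of Large\\nLanguage Models\\n{shawnxxh,chongyangtao,hishentao }@gmail.com\"\n",
+    "print(process_chunk(sample))"
+   ]
+  },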
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "5311a77f-c98c-4009-a982-c4393fd64fa4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "INPUT_FILE = \"./extracted_text.txt\"  # Replace with your file path\n",
+    "CHUNK_SIZE = 1000"
+   ]
+  },
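+  {
+   "cell_type": "markdown",
+   "id": "added-chunking-note",
+   "metadata": {},
+   "source": [
+    "The processing loop below slices the text at fixed 1000-character offsets, so a chunk can end mid-word or mid-sentence. A minimal whitespace-aware splitter (an optional sketch, not used by the loop below) would back each cut up to the last space inside the window:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "added-boundary-chunker",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sketch: split on whitespace boundaries so no word is cut in half\n",
+    "def chunk_on_word_boundaries(text: str, chunk_size: int = CHUNK_SIZE):\n",
+    "    chunks, start = [], 0\n",
+    "    while start < len(text):\n",
+    "        end = min(start + chunk_size, len(text))\n",
+    "        if end < len(text):\n",
+    "            cut = text.rfind(' ', start + 1, end)\n",
+    "            if cut != -1:  # back up to the last space in the window\n",
+    "                end = cut\n",
+    "        chunks.append(text[start:end].strip())\n",
+    "        start = end\n",
+    "    return chunks"
+   ]
+  },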
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "46e160e8-552a-43b2-9f9e-a7a52092318f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f258416a01df4b3ba899f734a3893ca8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Processing chunks:   0%|          | 0/101 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(INPUT_FILE, 'r', encoding='utf-8') as file:\n",
+    "    text = file.read()\n",
+    "\n",
+    "# Calculate number of chunks\n",
+    "num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE\n",
+    "\n",
+    "# Cell 6: Process the file\n",
+    "# Create output file name\n",
+    "output_file = f\"clean_{os.path.basename(INPUT_FILE)}\"\n",
+    "\n",
+    "# Process chunks and write to file\n",
+    "processed_chunks = []\n",
+    "with open(output_file, 'w', encoding='utf-8') as out_file:\n",
+    "    for i in tqdm(range(0, len(text), CHUNK_SIZE), desc=\"Processing chunks\"):\n",
+    "        chunk = text[i:i + CHUNK_SIZE]\n",
+    "        processed = process_chunk(chunk)\n",
+    "        out_file.write(processed + \" \")\n",
+    "        processed_chunks.append(processed)\n",
+    "\n",
+    "# Cell 7: Preview results\n",
+    "# Run this cell to see the first few processed chunks\n",
+    "print(\"First few processed chunks:\")\n",
+    "print(\"\\n\".join(processed_chunks[:3]))\n",
+    "\n",
+    "print(\"\\nOutput file saved as:\", output_file)\n",
+    "print(\"Total chunks processed:\", len(processed_chunks))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0183c47-339d-4041-ae83-77fc34931075",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}