@@ -0,0 +1,248 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Wt3pZ_J2tKeo"
+   },
+   "source": [
+    "# Tutorial\n",
+    "In this tutorial, we'll break a sample text document into chunks and generate contextual keywords for each one using Llama 3.1.\n",
+    "\n",
+    "Let's start by installing the required packages.\n",
+    "For Llama model inference, we use DeepInfra here, but you can use any inference service provider."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "LEz4Dfa9-i-Z"
+   },
+   "outputs": [],
+   "source": [
+    "# Install dependencies\n",
+    "!pip install tiktoken openai\n",
+    "\n",
+    "# config.py should define DEEPINFRA_API_KEY with your DeepInfra API key\n",
+    "from config import DEEPINFRA_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mwW6s7EJm9ul"
+   },
+   "source": [
+ "First, obtain your document content. For this tutorial, the recommended document size ranges from 2,000 to 20,000 tokens."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "_Iooj2f76wXl"
+   },
+   "outputs": [],
+   "source": [
+    "# Read the source document\n",
+    "with open('./data/llama_article.txt', 'r', encoding='utf-8') as file:\n",
+    "    document_content = file.read()\n"
+   ]
+  },
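+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, you can verify the document's token count with `tiktoken` (using the same `o200k_base` encoding we use for chunking below)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check: confirm the document falls within the recommended\n",
+    "# 2,000-20,000 token range\n",
+    "import tiktoken\n",
+    "enc = tiktoken.get_encoding(\"o200k_base\")\n",
+    "print(f\"Document tokens: {len(enc.encode(document_content))}\")"
+   ]
+  },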
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "q4IO9wIip8lV"
+   },
+   "source": [
+    "We will then split the document content into chunks of 300-1,000 tokens (here we use 400)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "oek9g5Xom2XJ"
+   },
+   "outputs": [],
+   "source": [
+    "import tiktoken\n",
+    "\n",
+    "# Split the document into fixed-size token chunks (simple approach, no overlap)\n",
+    "def split_into_chunks(content, chunk_size):\n",
+    "    enc = tiktoken.get_encoding(\"o200k_base\")\n",
+    "    tokens = enc.encode(content)\n",
+    "    chunks = []\n",
+    "    for left in range(0, len(tokens), chunk_size):\n",
+    "        chunks.append(enc.decode(tokens[left:left + chunk_size]))\n",
+    "    return chunks\n",
+    "\n",
+    "chunks = split_into_chunks(document_content, 400)\n",
+    "\n",
+    "# Concatenate the chunks, labeling each one with its id\n",
+    "chunked_content = \"\"\n",
+    "for idx, text in enumerate(chunks):\n",
+    "    chunked_content += f\"### Chunk {idx+1} ###\\n{text}\\n\\n\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "J4evTJGL83P-"
+   },
+   "source": [
+    "Now your `chunked_content` looks like this:\n",
+    "\n",
+    "```\n",
+    "### Chunk 1 ###\n",
+    "{chunk1}\n",
+    "\n",
+    "### Chunk 2 ###\n",
+    "{chunk2}\n",
+    "\n",
+    "...\n",
+    "```\n"
+   ]
+  },
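+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preview the first 500 characters to verify the chunk markers look right\n",
+    "print(chunked_content[:500])"
+   ]
+  },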
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "iTBzN--tmIv-"
+   },
+   "source": [
+    "Next, generate contextual keywords to give each chunk a better standalone representation for embeddings. Here, we use DeepInfra for inference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2AqW8zgw9Ah2"
+   },
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "\n",
+    "# DeepInfra exposes an OpenAI-compatible endpoint\n",
+    "client = OpenAI(api_key=DEEPINFRA_API_KEY, base_url=\"https://api.deepinfra.com/v1/openai\")\n",
+    "\n",
+    "def deepinfra_run(system_prompt, user_message):\n",
+    "    chat_completion = client.chat.completions.create(\n",
+    "        model=\"meta-llama/Meta-Llama-3.1-405B-Instruct\",\n",
+    "        messages=[{\"role\": \"system\", \"content\": system_prompt}, {\"role\": \"user\", \"content\": user_message}],\n",
+    "        max_tokens=4096\n",
+    "    )\n",
+    "    return chat_completion.choices[0].message.content\n",
+    "\n",
+    "system_prompt = '''\n",
+    "Each chunk is separated as ### Chunk [id] ###. For each chunk, generate the keywords required to fully understand the chunk without needing to look at the previous chunks.\n",
+    "Don't just say \"List of services\", because it's unclear which services you are referring to. Make sure to cover all chunks.\n",
+    "Sample output:\n",
+    "Chunk 1: BMW X5, pricings in France\n",
+    "Chunk 2: BMW X5, discounts\n",
+    "'''\n",
+    "\n",
+    "keywords_st = deepinfra_run(system_prompt, chunked_content)\n",
+    "print(keywords_st)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ucmJJjvuqpSU"
+   },
+   "source": [
+    "Next, we parse the generated keywords into an array (one list of keywords per chunk).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 198
+    },
+    "id": "ikTy7nA8AFGz",
+    "outputId": "e8fba18d-a697-472e-b5cd-cb71f23c8a0d"
+   },
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def parse_keywords(content):\n",
+    "    result = []\n",
+    "    lines = content.strip().split('\\n')\n",
+    "    current_chunk = []\n",
+    "    inline_pattern = re.compile(r'^\\s*[^#:]+\\s*:\\s*(.+)$')  # matches lines like \"Chunk 1: word1, word2\"\n",
+    "    section_pattern = re.compile(r'^###\\s*[^#]+\\s*###$')  # matches lines like \"### Chunk 1 ###\"\n",
+    "\n",
+    "    for line in lines:\n",
+    "        line = line.strip()\n",
+    "        if not line:\n",
+    "            continue\n",
+    "\n",
+    "        # \"Chunk N: ...\" line: everything after the colon is one chunk's keywords\n",
+    "        inline_match = inline_pattern.match(line)\n",
+    "        if inline_match:\n",
+    "            words_str = inline_match.group(1)\n",
+    "            words = [word.strip() for word in words_str.split(',') if word.strip()]\n",
+    "            result.append(words)\n",
+    "            continue\n",
+    "\n",
+    "        # \"### Chunk N ###\" header: close the previous section, start a new one\n",
+    "        if section_pattern.match(line):\n",
+    "            if current_chunk:\n",
+    "                result.append(current_chunk)\n",
+    "                current_chunk = []\n",
+    "            continue\n",
+    "\n",
+    "        # otherwise the line continues the current section's keyword list\n",
+    "        words = [word.strip() for word in line.split(',') if word.strip()]\n",
+    "        current_chunk.extend(words)\n",
+    "\n",
+    "    if current_chunk:\n",
+    "        result.append(current_chunk)\n",
+    "    return result\n",
+    "\n",
+    "keywords = parse_keywords(keywords_st)\n",
+    "print(keywords)"
+   ]
+  },
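+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: parse_keywords applied to the sample output format from the\n",
+    "# system prompt above. Expected: [['BMW X5', 'pricings in France'], ['BMW X5', 'discounts']]\n",
+    "print(parse_keywords(\"Chunk 1: BMW X5, pricings in France\\nChunk 2: BMW X5, discounts\"))"
+   ]
+  },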
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "X1myu--0kUi0"
+   },
+   "source": [
+    "Now you can prepend the generated keywords to each chunk to give it better standalone context for embeddings. For example:\n",
+    "\n",
+    "```\n",
+    "chunk1 = #{keywords1}\\n{chunk1}\n",
+    "```\n",
+    "\n",
+    "A minimal sketch of this step is shown in the next cell."
+   ]
+  },
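+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A minimal sketch (one possible approach): prepend each chunk's keywords as a\n",
+    "# header line. This assumes len(keywords) == len(chunks); zip() silently drops\n",
+    "# extras if the model skipped a chunk.\n",
+    "contextual_chunks = []\n",
+    "for words, text in zip(keywords, chunks):\n",
+    "    contextual_chunks.append(f\"{', '.join(words)}\\n{text}\")\n",
+    "\n",
+    "print(contextual_chunks[0][:300])"
+   ]
+  }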
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}