{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "4OIZC5EaIYY_" }, "outputs": [], "source": [ "# Pinned dependencies. %pip (not !pip) installs into the environment of the running kernel.\n", "%pip install -Uqqq pip --progress-bar off\n", "%pip install -qqq langchain==0.0.139 --progress-bar off\n", "%pip install -qqq openai==0.27.4 --progress-bar off\n", "%pip install -Uqqq watermark==2.3.1 --progress-bar off\n", "%pip install -Uqqq chromadb==0.3.21 --progress-bar off\n", "%pip install -Uqqq tiktoken==0.3.3 --progress-bar off" ] }, { "cell_type": "code", "source": [ "%load_ext watermark" ], "metadata": { "id": "qQ-YYVMCKbYf" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "import textwrap\n", "\n", "import chromadb\n", "import langchain\n", "import openai\n", "from langchain.chains import RetrievalQA\n", "from langchain.chat_models import ChatOpenAI\n", "from langchain.document_loaders import WebBaseLoader\n", "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.indexes import VectorstoreIndexCreator\n", "from langchain.llms import OpenAI\n", "from langchain.prompts import PromptTemplate\n", "from langchain.vectorstores import Chroma" ], "metadata": { "id": "htp4LNX0KOtb" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "%watermark --iversions -v -m" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TYgj1UZTKl17", "outputId": "01165430-c363-4e8d-cd6e-690a1e9363cc" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Python implementation: CPython\n", "Python version : 3.9.16\n", "IPython version : 7.34.0\n", "\n", "Compiler : GCC 9.4.0\n", "OS : Linux\n", "Release : 5.10.147+\n", "Machine : x86_64\n", "Processor : x86_64\n", "CPU cores : 2\n", "Architecture: 64bit\n", "\n", "chromadb : 
0.3.21\n", "openai : 0.27.4\n", "langchain: 0.0.139\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "def print_response(response: str, width: int = 100) -> None:\n", "    \"\"\"Print ``response`` re-wrapped so that no line is longer than ``width`` characters.\n", "\n", "    Wrapping uses ``textwrap``'s defaults, which convert newlines and tabs in\n", "    the input to spaces, so existing paragraph breaks are not preserved.\n", "\n", "    Args:\n", "        response: Text to print, e.g. the answer returned by a chain.\n", "        width: Maximum line length of the printed text.\n", "    \"\"\"\n", "    print(textwrap.fill(response, width=width))" ], "metadata": { "id": "STDwioZBj_pp" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Never store a real key in the notebook: reuse OPENAI_API_KEY when it is\n", "# already set in the environment, otherwise prompt for it without echoing.\n", "if not os.environ.get(\"OPENAI_API_KEY\"):\n", "    from getpass import getpass\n", "\n", "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")" ], "metadata": { "id": "p8f4eu1SKp4j" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = OpenAI(temperature=0)" ], "metadata": { "id": "BA2qn63lLUc-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(\n", " model(\n", " \"You're Dwight K. Schrute from the Office. Suggest 5 places to visit in Scranton that are connected to the TV show.\"\n", " )\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1-rQ5ETGO0Q6", "outputId": "a1a550a5-15b3-4fb6-ca8f-4247a696e1f1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "\n", "1. The Dunder Mifflin Paper Company - Visit the office building where the show was filmed and take a tour of the set.\n", "\n", "2. Poor Richard's Pub - Enjoy a drink at the bar where the cast often hung out.\n", "\n", "3. Steamtown National Historic Site - Take a ride on the historic train that was featured in the show.\n", "\n", "4. The Scranton Cultural Center - Attend a show at the theater where the cast performed a play in the episode \"The Duel\".\n", "\n", "5. 
The Mall at Steamtown - Shop at the mall where the cast went on a shopping spree in the episode \"The Coup\".\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Q&A Over a Document" ], "metadata": { "id": "lfA_kFae3ROU" } }, { "cell_type": "code", "source": [ "loader = WebBaseLoader(\n", " \"https://blog.twitter.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm\"\n", ")" ], "metadata": { "id": "DlJWvzaSVGwk" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "documents = loader.load()\n", "len(documents)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0k2sQmZCVKRF", "outputId": "c46048ba-9ae2-4561-81fa-d53e5327ceb7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "document = documents[0]\n", "document.__dict__.keys()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "334_cQoO4cZP", "outputId": "9c26612c-94d0-485d-c1a9-a24d89393339" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "dict_keys(['page_content', 'metadata'])" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "document.page_content[:100]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "eHOdt3K_4x-E", "outputId": "17c01652-b06d-473e-daff-18b74831c44e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\"\\n\\n\\n\\n\\nTwitter's Recommendation Algorithm\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nEngineering\\n\\n\\n\\n\\n\\nBac\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ 
"document.metadata" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MxnizWE-4uzz", "outputId": "0de4c185-06f6-4e93-c88f-4fc14e3a225a" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'source': 'https://blog.twitter.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm'}" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "index = VectorstoreIndexCreator().from_loaders([loader])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dNk8kGhDa3GR", "outputId": "0f14832b-ec0e-4263-96e1-752fabf757c2" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "WARNING:chromadb:Using embedded DuckDB without persistence: data will be transient\n" ] } ] }, { "cell_type": "code", "source": [ "query = \"\"\"\n", "You're Dwight K. Schrute from the Office.\n", "Explain the Twitter recommendation algorithm in 5 sentences using analogies from the Office.\n", "\"\"\"\n", "print_response(index.query(query))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b-9FzImFchcH", "outputId": "064fa564-1d4c-4e4f-d589-6f86d1de61aa" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " The Twitter recommendation algorithm is like Dwight K. Schrute's job at Dunder Mifflin. It takes\n", "the 500 million Tweets posted daily and distills them down to a handful of top Tweets that show up\n", "on your timeline, just like Dwight distills the vast amount of paper at Dunder Mifflin into a few\n", "important documents. The algorithm uses a set of core models and features to extract latent\n", "information from Tweet, user, and engagement data, just like Dwight uses his keen eye to spot the\n", "important details in the documents. 
It then uses a logistic regression model to rank the Tweets,\n", "similar to how Dwight ranks the documents in order of importance. Finally, it traverses the graph of\n", "engagements and follows to answer questions about what Tweets and Users are similar to your\n", "interests, just like Dwight uses his knowledge of the office to answer questions about the people\n", "and documents in the office.\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Using a Prompt Template" ], "metadata": { "id": "8sZR4jBD3eGs" } }, { "cell_type": "code", "source": [ "template = \"\"\"You're Dwight K. Schrute from the Office.\n", "\n", "{context}\n", "\n", "Answer with analogies from the Office to the question and the way Dwight speaks.\n", "\n", "Question: {question}\n", "Answer:\"\"\"\n", "\n", "prompt = PromptTemplate(template=template, input_variables=[\"context\", \"question\"])\n", "print(\n", " prompt.format(\n", " context=\"Paper sells are declining 10% year over year.\",\n", " question=\"How to sell paper?\",\n", " )\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oRptJsAvPNbF", "outputId": "7e632e2b-5a8c-46c2-cf36-59fba0a45cea" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "You're Dwight K. 
Schrute from the Office.\n", "\n", "Paper sells are declining 10% year over year.\n", "\n", "Answer with analogies from the Office to the question and the way Dwight speaks.\n", "\n", "Question: How to sell paper?\n", "Answer:\n" ] } ] }, { "cell_type": "code", "source": [ "embeddings = OpenAIEmbeddings()\n", "db = Chroma.from_documents(documents, embeddings)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rXFpoKGbkvSn", "outputId": "9e0c4f83-44c3-4e5c-c35b-a3a2a45738d2" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "WARNING:chromadb:Using embedded DuckDB without persistence: data will be transient\n" ] } ] }, { "cell_type": "code", "source": [ "chain_type_kwargs = {\"prompt\": prompt}\n", "chain = RetrievalQA.from_chain_type(\n", " llm=ChatOpenAI(temperature=0),\n", " chain_type=\"stuff\",\n", " retriever=db.as_retriever(search_kwargs={\"k\": 1}),\n", " chain_type_kwargs=chain_type_kwargs,\n", ")" ], "metadata": { "id": "3TjhORB7xEUp" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "query = \"Explain the Twitter recommendation algorithm in 5 sentences\"\n", "response = chain.run(query)" ], "metadata": { "id": "sfbQY893xbdY" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print_response(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rT_8hNbP6mN1", "outputId": "2be2e3f0-8a31-4565-869d-4eb668b3a2bb" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Well, Twitter's got this fancy algorithm that picks out the best tweets from the millions of tweets\n", "posted every day. It's like Michael Scott trying to pick the best Dundie award winners from all the\n", "employees. They use a bunch of models and features to figure out what you might like, like how\n", "Dwight uses his knowledge of his coworkers to predict their behavior. 
Then they rank the tweets\n", "using a big neural network, kind of like how Jim ranks his pranks on Dwight. Finally, they filter\n", "out any tweets you don't want to see, like how Angela filters out any fun from the office. And\n", "voila, you've got your personalized Twitter timeline.\n" ] } ] }, { "cell_type": "markdown", "source": [ "## References\n", "\n", "- [Twitter's Recommendation Algorithm](https://blog.twitter.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm)" ], "metadata": { "id": "FpikEA-s8RMw" } } ] }