deprecate OctoAI (#873)

Igor Kasianenko 2 months ago
parent
commit
9fd54a395c
21 changed files with 2 additions and 14677 deletions
  1. + 0 - 3  .github/scripts/spellcheck_conf/wordlist.txt
  2. + 0 - 698  3p-integrations/octoai/MediaGen.ipynb
  3. + 0 - 450  3p-integrations/octoai/RAG_chatbot_example/RAG_chatbot_example.ipynb
  4. BIN  3p-integrations/octoai/RAG_chatbot_example/data/Llama Getting Started Guide.pdf
  5. + 0 - 7  3p-integrations/octoai/RAG_chatbot_example/requirements.txt
  6. BIN  3p-integrations/octoai/RAG_chatbot_example/vectorstore/db_faiss/index.faiss
  7. BIN  3p-integrations/octoai/RAG_chatbot_example/vectorstore/db_faiss/index.pkl
  8. + 0 - 1010  3p-integrations/octoai/getting_to_know_llama.ipynb
  9. + 0 - 438  3p-integrations/octoai/hello_llama_cloud.ipynb
  10. + 0 - 247  3p-integrations/octoai/live_data.ipynb
  11. + 0 - 116  3p-integrations/octoai/llama2_gradio.ipynb
  12. + 0 - 335  3p-integrations/octoai/video_summary.ipynb
  13. + 2 - 15  end-to-end-use-cases/README.md
  14. + 0 - 668  end-to-end-use-cases/customerservice_chatbots/ai_agent_chatbot/SalesBot.ipynb
  15. + 0 - 10262  end-to-end-use-cases/customerservice_chatbots/ai_agent_chatbot/musical_instruments_reviews.csv
  16. + 0 - 218  end-to-end-use-cases/live_data.ipynb
  17. + 0 - 1  getting-started/inference/README.md
  18. + 0 - 147  getting-started/inference/mobile_inference/android_inference/README.md
  19. + 0 - 14  getting-started/inference/mobile_inference/android_inference/mlc-package-config.json
  20. + 0 - 14  getting-started/inference/mobile_inference/android_inference/requirements.txt
  21. + 0 - 34  src/llama_cookbook/inference/llm.py

+ 0 - 3
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1254,9 +1254,6 @@ subtasks
 EleutherAI
 CodeLlama
 LlamaGuard
-OctoAI
-octoai
-OctoAI's
 PurpleLlama
 Youtube
 wandb

Diff not shown because the file is too large.
+ 0 - 698
3p-integrations/octoai/MediaGen.ipynb


Diff not shown because the file is too large.
+ 0 - 450
3p-integrations/octoai/RAG_chatbot_example/RAG_chatbot_example.ipynb


BIN
3p-integrations/octoai/RAG_chatbot_example/data/Llama Getting Started Guide.pdf


+ 0 - 7
3p-integrations/octoai/RAG_chatbot_example/requirements.txt

@@ -1,7 +0,0 @@
-gradio==4.19.2
-pypdf==4.0.0
-langchain==0.1.19
-sentence-transformers==2.2.2
-faiss-cpu==1.7.4
-text-generation==0.6.1
-octoai-sdk==0.10.1

BIN
3p-integrations/octoai/RAG_chatbot_example/vectorstore/db_faiss/index.faiss


BIN
3p-integrations/octoai/RAG_chatbot_example/vectorstore/db_faiss/index.pkl


Diff not shown because the file is too large.
+ 0 - 1010
3p-integrations/octoai/getting_to_know_llama.ipynb


+ 0 - 438
3p-integrations/octoai/hello_llama_cloud.ipynb

@@ -1,438 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "1c1ea03a-cc69-45b0-80d3-664e48ca6831",
-   "metadata": {},
-   "source": [
-    "## This demo app shows:\n",
-    "* How to run Llama 3 in the cloud hosted on OctoAI\n",
-    "* How to use LangChain to ask Llama general questions and follow up questions\n",
-    "* How to use LangChain to load a recent PDF doc - the Llama paper pdf - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama be able to answer questions about your own data. RAG is one way to prevent LLM's hallucination\n",
-    "\n",
-    "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
-    "After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "61dde626",
-   "metadata": {},
-   "source": [
-    "Let's start by installing the necessary packages:\n",
-    "- sentence-transformers for text embeddings\n",
-    "- chromadb gives us database capabilities\n",
-    "- langchain provides necessary RAG tools for this demo\n",
-    "\n",
-    "And setting up the OctoAI token."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2c608df5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%pip install langchain==0.1.19 octoai-sdk==0.10.1 openai sentence-transformers chromadb pypdf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b9c5546a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from getpass import getpass\n",
-    "import os\n",
-    "\n",
-    "OCTOAI_API_TOKEN = getpass()\n",
-    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3e8870c1",
-   "metadata": {},
-   "source": [
-    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
-    "\n",
-    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* meta-llama-3-8b-instruct\n",
-    "* meta-llama-3-70b-instruct\n",
-    "* codellama-7b-instruct\n",
-    "* codellama-13b-instruct\n",
-    "* codellama-34b-instruct\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
-    "* llamaguard-7b"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ad536adb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
-    "\n",
-    "llama3_8b = \"meta-llama-3-8b-instruct\"\n",
-    "llm = OctoAIEndpoint(\n",
-    "    model=llama3_8b,\n",
-    "    max_tokens=500,\n",
-    "    temperature=0.01\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fd207c80",
-   "metadata": {},
-   "source": [
-    "With the model set up, you are now ready to ask some questions. Here is an example of the simplest way to ask the model some general questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "493a7148",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "question = \"who wrote the book Innovator's dilemma?\"\n",
-    "answer = llm.invoke(question)\n",
-    "print(answer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f315f000",
-   "metadata": {},
-   "source": [
-    "We will then try to follow up the response with a question asking for more information on the book. \n",
-    "\n",
-    "Since the chat history is not passed on Llama doesn't have the context and doesn't know this is more about the book thus it treats this as new query.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9b5c8676",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# chat history not passed so Llama doesn't have the context and doesn't know this is more about the book\n",
-    "followup = \"tell me more\"\n",
-    "followup_answer = llm.invoke(followup)\n",
-    "print(followup_answer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9aeaffc7",
-   "metadata": {},
-   "source": [
-    "To get around this we will need to provide the model with history of the chat. \n",
-    "\n",
-    "To do this, we will use  [`ConversationBufferMemory`](https://python.langchain.com/docs/modules/memory/types/buffer) to pass the chat history to the model and give it the capability to handle follow up questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5428ca27",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# using ConversationBufferMemory to pass memory (chat history) for follow up questions\n",
-    "from langchain.chains import ConversationChain\n",
-    "from langchain.memory import ConversationBufferMemory\n",
-    "\n",
-    "memory = ConversationBufferMemory()\n",
-    "conversation = ConversationChain(\n",
-    "    llm=llm, \n",
-    "    memory=memory,\n",
-    "    verbose=False\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a3e9af5f",
-   "metadata": {},
-   "source": [
-    "Once this is set up, let us repeat the steps from before and ask the model a simple question.\n",
-    "\n",
-    "Then we pass the question and answer back into the model for context along with the follow up question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "baee2d22",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# restart from the original question\n",
-    "answer = conversation.predict(input=question)\n",
-    "print(answer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9c7d67a8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# pass context (previous question and answer) along with the follow up \"tell me more\" to Llama who now knows more of what\n",
-    "memory.save_context({\"input\": question},\n",
-    "                    {\"output\": answer})\n",
-    "followup_answer = conversation.predict(input=followup)\n",
-    "print(followup_answer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fc436163",
-   "metadata": {},
-   "source": [
-    "Next, let's explore using Llama 3 to answer questions using documents for context. \n",
-    "This gives us the ability to update Llama 3's knowledge thus giving it better context without needing to finetune. \n",
-    "\n",
-    "We will use the PyPDFLoader to load in a pdf, in this case, the Llama paper."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f5303d75",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import PyPDFLoader\n",
-    "loader = PyPDFLoader(\"https://arxiv.org/pdf/2307.09288.pdf\")\n",
-    "docs = loader.load()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "678c2b4a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check docs length and content\n",
-    "print(len(docs), docs[0].page_content[0:300])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "73b8268e",
-   "metadata": {},
-   "source": [
-    "We need to store our documents. There are more than 30 vector stores (DBs) supported by LangChain.\n",
-    "For this example we will use [Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma) which is light-weight and in memory so it's easy to get started with.\n",
-    "For other vector stores especially if you need to store a large amount of data - see https://python.langchain.com/docs/integrations/vectorstores\n",
-    "\n",
-    "We will also import the OctoAIEmbeddings and RecursiveCharacterTextSplitter to assist in storing the documents."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "eecb6a34",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.vectorstores import Chroma\n",
-    "\n",
-    "# embeddings are numerical representations of the question and answer text\n",
-    "from langchain_community.embeddings import OctoAIEmbeddings\n",
-    "\n",
-    "# use a common text splitter to split text into chunks\n",
-    "from langchain.text_splitter import RecursiveCharacterTextSplitter"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "36d4a17c",
-   "metadata": {},
-   "source": [
-    "To store the documents, we will need to split them into chunks using [`RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter) and create vector representations of these chunks using [`OctoAIEmbeddings`](https://octoai.cloud/tools/text/embeddings?mode=api&model=thenlper%2Fgte-large) on them before storing them into our vector database.\n",
-    "\n",
-    "In general, you should use larger chuck sizes for highly structured text such as code and smaller size for less structured text. You may need to experiment with different chunk sizes and overlap values to find out the best numbers."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bc65e161",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)\n",
-    "all_splits = text_splitter.split_documents(docs)\n",
-    "\n",
-    "# create the vector db to store all the split chunks as embeddings\n",
-    "embeddings = OctoAIEmbeddings(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/embeddings\"\n",
-    ")\n",
-    "vectordb = Chroma.from_documents(\n",
-    "    documents=all_splits,\n",
-    "    embedding=embeddings,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "54ad02d7",
-   "metadata": {},
-   "source": [
-    "We then use ` RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama, thereby increasing its knowledge.\n",
-    "\n",
-    "For each question, LangChain performs a semantic similarity search of it in the vector db, then passes the search results as the context to Llama to answer the question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "00e3f72b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# use LangChain's RetrievalQA, to associate Llama with the loaded documents stored in the vector db\n",
-    "from langchain.chains import RetrievalQA\n",
-    "\n",
-    "qa_chain = RetrievalQA.from_chain_type(\n",
-    "    llm,\n",
-    "    retriever=vectordb.as_retriever()\n",
-    ")\n",
-    "\n",
-    "question = \"What is llama?\"\n",
-    "result = qa_chain({\"query\": question})\n",
-    "print(result['result'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7e63769a",
-   "metadata": {},
-   "source": [
-    "Now, lets bring it all together by incorporating follow up questions.\n",
-    "\n",
-    "First we ask a follow up questions without giving the model context of the previous conversation.\n",
-    "Without this context, the answer we get does not relate to our original question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "53f27473",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# no context passed so Llama doesn't have enough context to answer so it lets its imagination go wild\n",
-    "result = qa_chain({\"query\": \"what are its use cases?\"})\n",
-    "print(result['result'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "833221c0",
-   "metadata": {},
-   "source": [
-    "As we did before, let us use the `ConversationalRetrievalChain` package to give the model context of our previous question so we can add follow up questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "743644a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# use ConversationalRetrievalChain to pass chat history for follow up questions\n",
-    "from langchain.chains import ConversationalRetrievalChain\n",
-    "chat_chain = ConversationalRetrievalChain.from_llm(llm, vectordb.as_retriever(), return_source_documents=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7c3d1142",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# let's ask the original question \"What is llama?\" again\n",
-    "result = chat_chain({\"question\": question, \"chat_history\": []})\n",
-    "print(result['answer'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4b17f08f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# this time we pass chat history along with the follow up so good things should happen\n",
-    "chat_history = [(question, result[\"answer\"])]\n",
-    "followup = \"what are its use cases?\"\n",
-    "followup_answer = chat_chain({\"question\": followup, \"chat_history\": chat_history})\n",
-    "print(followup_answer['answer'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "04f4eabf",
-   "metadata": {},
-   "source": [
-    "Further follow ups can be made possible by updating chat_history.\n",
-    "\n",
-    "Note that results can get cut off. You may set \"max_new_tokens\" in the OctoAIEndpoint call above to a larger number (like shown below) to avoid the cut off.\n",
-    "\n",
-    "```python\n",
-    "model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\": 1000}\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "95d22347",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# further follow ups can be made possible by updating chat_history like this:\n",
-    "chat_history.append((followup, followup_answer[\"answer\"]))\n",
-    "more_followup = \"what tasks can it assist with?\"\n",
-    "more_followup_answer = chat_chain({\"question\": more_followup, \"chat_history\": chat_history})\n",
-    "print(more_followup_answer['answer'])"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
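
The RAG walkthrough deleted above is built from provider-agnostic LangChain pieces; only the OctoAI endpoint and embeddings are OctoAI-specific. A minimal sketch of the same flow, assuming the pinned langchain 0.1.x APIs plus the langchain-openai package, an OpenAI-compatible Llama 3 host (the `base_url`, `PROVIDER_API_KEY`, and model id below are placeholders), and a local sentence-transformers model swapped in for `OctoAIEmbeddings`:

```python
import os

from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI

# Placeholder endpoint and model id for any OpenAI-compatible Llama 3 host.
llm = ChatOpenAI(
    base_url="https://your-provider.example/v1",
    api_key=os.environ["PROVIDER_API_KEY"],
    model="meta-llama-3-8b-instruct",
    temperature=0.01,
    max_tokens=500,
)

# Load the Llama 2 paper and split it into overlapping chunks.
docs = PyPDFLoader("https://arxiv.org/pdf/2307.09288.pdf").load()
splits = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=20
).split_documents(docs)

# Embed the chunks locally instead of via OctoAIEmbeddings.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
)

# Retrieve relevant chunks and pass them to the model as context.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())
print(qa_chain({"query": "What is Llama?"})["result"])
```

The follow-up-question step works the same way: wrap `vectordb.as_retriever()` in `ConversationalRetrievalChain.from_llm` and pass the accumulated `chat_history` with each call, as in the deleted cells.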

+ 0 - 247
3p-integrations/octoai/live_data.ipynb

@@ -1,247 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "30eb1704-8d76-4bc9-9308-93243aeb69cb",
-   "metadata": {},
-   "source": [
-    "## This demo app shows:\n",
-    "* How to use LlamaIndex, an open source library to help you build custom data augmented LLM applications\n",
-    "* How to ask Llama 3 questions about recent live data via the Tavily live search API\n",
-    "\n",
-    "The LangChain package is used to facilitate the call to Llama 3 hosted on OctoAI\n",
-    "\n",
-    "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
-    "After the free trial ends, you will need to enter billing info to continue to use Llama3 hosted on OctoAI."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "68cf076e",
-   "metadata": {},
-   "source": [
-    "We start by installing the necessary packages:\n",
-    "- [langchain](https://python.langchain.com/docs/get_started/introduction) which provides RAG capabilities\n",
-    "- [llama-index](https://docs.llamaindex.ai/en/stable/) for data augmentation."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1d0005d6-e928-4d1a-981b-534a40e19e56",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install llama-index \n",
-    "!pip install llama-index-core\n",
-    "!pip install llama-index-llms-octoai\n",
-    "!pip install llama-index-embeddings-octoai\n",
-    "!pip install octoai-sdk\n",
-    "!pip install tavily-python\n",
-    "!pip install replicate"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "73e8e661",
-   "metadata": {},
-   "source": [
-    "Next we set up the OctoAI token."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d9d76e33",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from getpass import getpass\n",
-    "import os\n",
-    "\n",
-    "OCTOAI_API_TOKEN = getpass()\n",
-    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cb210c7c",
-   "metadata": {},
-   "source": [
-    "We then call the Llama 3 model from OctoAI.\n",
-    "\n",
-    "We will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
-    "\n",
-    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* meta-llama-3-8b-instruct\n",
-    "* meta-llama-3-70b-instruct\n",
-    "* codellama-7b-instruct\n",
-    "* codellama-13b-instruct\n",
-    "* codellama-34b-instruct\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
-    "* llamaguard-7b"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "21fe3849",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# use ServiceContext to configure the LLM used and the custom embeddings\n",
-    "from llama_index.core import ServiceContext\n",
-    "\n",
-    "# VectorStoreIndex is used to index custom data \n",
-    "from llama_index.core import VectorStoreIndex\n",
-    "\n",
-    "from llama_index.core import Settings, VectorStoreIndex\n",
-    "from llama_index.embeddings.octoai import OctoAIEmbedding\n",
-    "from llama_index.llms.octoai import OctoAI\n",
-    "\n",
-    "Settings.llm = OctoAI(\n",
-    "    model=\"meta-llama-3-8b-instruct\",\n",
-    "    token=OCTOAI_API_TOKEN,\n",
-    "    temperature=0.0,\n",
-    "    max_tokens=128,\n",
-    ")\n",
-    "\n",
-    "Settings.embed_model = OctoAIEmbedding(api_key=OCTOAI_API_TOKEN)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f8ff812b",
-   "metadata": {},
-   "source": [
-    "Next you will use the [Tavily](https://tavily.com/) search engine to augment the Llama 3's responses. To create a free trial Tavily Search API, sign in with your Google or Github account [here](https://app.tavily.com/sign-in)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "75275628-5235-4b55-8033-601c76107528",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tavily import TavilyClient\n",
-    "\n",
-    "TAVILY_API_KEY = getpass()\n",
-    "tavily = TavilyClient(api_key=TAVILY_API_KEY)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "476d72da",
-   "metadata": {},
-   "source": [
-    "Do a live web search on \"Llama 3 fine-tuning\"."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "effc9656-b18d-4d24-a80b-6066564a838b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "response = tavily.search(query=\"Llama 3 fine-tuning\")\n",
-    "context = [{\"url\": obj[\"url\"], \"content\": obj[\"content\"]} for obj in response['results']]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6b5af98b-c26b-4fd7-8031-31ac4915cdac",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "context"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0f4ea96b-bb00-4a1f-8bd2-7f15237415f6",
-   "metadata": {},
-   "source": [
-    "Create documents based on the search results, index and save them to a vector store, then create a query engine."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7513ac70-155a-4d56-b326-0e8c2733ab99",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llama_index.core import Document\n",
-    "\n",
-    "documents = [Document(text=ct['content']) for ct in context]\n",
-    "index = VectorStoreIndex.from_documents(documents)\n",
-    "\n",
-    "query_engine = index.as_query_engine(streaming=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "df743c62-165c-4834-b1f1-7d7848a6815e",
-   "metadata": {},
-   "source": [
-    "You are now ready to ask Llama 3 questions about the live data using the query engine."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b2fd905b-575a-45f1-88da-9b093caa232a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "response = query_engine.query(\"give me a summary\")\n",
-    "response.print_response_stream()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "88c45380-1d00-46d5-80ac-0eff68fd1f8a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "query_engine.query(\"what's the latest about Llama 3 fine-tuning?\").print_response_stream()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0fe54976-5345-4426-a6f0-dc3bfd45dac3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "query_engine.query(\"tell me more about Llama 3 fine-tuning\").print_response_stream()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
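
For reference, the live-data pattern in the deleted notebook reduces to a few LlamaIndex calls and is not tied to OctoAI. A sketch under the assumption that `Settings.llm` and `Settings.embed_model` are configured for whichever hosted Llama 3 and embedding endpoints you use (LlamaIndex otherwise defaults to OpenAI models, which requires `OPENAI_API_KEY`):

```python
import os

from llama_index.core import Document, VectorStoreIndex
from tavily import TavilyClient

# Live web search via Tavily.
tavily = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
response = tavily.search(query="Llama 3 fine-tuning")

# Wrap each search hit's content in a Document and build an in-memory vector index.
documents = [Document(text=hit["content"]) for hit in response["results"]]
index = VectorStoreIndex.from_documents(documents)

# Query the indexed search results with the configured LLM.
query_engine = index.as_query_engine()
print(query_engine.query("What's the latest about Llama 3 fine-tuning?"))
```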

+ 0 - 116
3p-integrations/octoai/llama2_gradio.ipynb

@@ -1,116 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "47a9adb3",
-   "metadata": {},
-   "source": [
-    "## This demo app shows how to query Llama 3 using the Gradio UI.\n",
-    "\n",
-    "Since we are using OctoAI in this example, you'll need to obtain an OctoAI token:\n",
-    "\n",
-    "- You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account\n",
-    "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n",
-    "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI.\n",
-    "\n",
-    "To run this example:\n",
-    "- Run the notebook\n",
-    "- Set up your OCTOAI API token and enter it when prompted\n",
-    "- Enter your question and click Submit\n",
-    "\n",
-    "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer.\n",
-    "\n",
-    "Let's start by installing the necessary packages:\n",
-    "- openai for us to use its APIs to talk to the OctoAI endpoint\n",
-    "- gradio is used for the UI elements\n",
-    "\n",
-    "And setting up the OctoAI token."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6ae4f858-6ef7-49d9-b45b-1ef79d0217a0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install openai gradio"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3306c11d-ed82-41c5-a381-15fb5c07d307",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from getpass import getpass\n",
-    "import os\n",
-    "\n",
-    "OCTOAI_API_TOKEN = getpass()\n",
-    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "928041cc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import gradio as gr\n",
-    "import openai\n",
-    "\n",
-    "# Init OctoAI client\n",
-    "client = openai.OpenAI(\n",
-    "    base_url=\"https://text.octoai.run/v1\",\n",
-    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
-    ")\n",
-    "\n",
-    "def predict(message, history):\n",
-    "    history_openai_format = []\n",
-    "    for human, assistant in history:\n",
-    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
-    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
-    "    history_openai_format.append({\"role\": \"user\", \"content\": message})\n",
-    "\n",
-    "    response = client.chat.completions.create(\n",
-    "        model = 'meta-llama-3-70b-instruct',\n",
-    "        messages = history_openai_format,\n",
-    "        temperature = 0.0,\n",
-    "        stream = True\n",
-    "     )\n",
-    "\n",
-    "    partial_message = \"\"\n",
-    "    for chunk in response:\n",
-    "        if chunk.choices[0].delta.content is not None:\n",
-    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
-    "              yield partial_message\n",
-    "\n",
-    "gr.ChatInterface(predict).launch()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
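
The Gradio pattern in the deleted notebook is not OctoAI-specific: any OpenAI-compatible chat endpoint can back the same UI. A hedged sketch; the `base_url`, `PROVIDER_API_KEY`, and model id are placeholders, and the `(user, assistant)` history tuples assume Gradio 4.x's default `ChatInterface` format:

```python
import os

import gradio as gr
import openai

# Placeholder OpenAI-compatible endpoint and key.
client = openai.OpenAI(
    base_url="https://your-provider.example/v1",
    api_key=os.environ["PROVIDER_API_KEY"],
)

def predict(message, history):
    # Rebuild the OpenAI-style message list from Gradio's (user, assistant) pairs.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model="meta-llama-3-70b-instruct",  # placeholder model id
        messages=messages,
        temperature=0.0,
        stream=True,
    )

    # Yield the growing answer so the UI streams token by token.
    partial = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            partial += chunk.choices[0].delta.content
            yield partial

gr.ChatInterface(predict).launch()
```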

+ 0 - 335
3p-integrations/octoai/video_summary.ipynb

@@ -1,335 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "30b1235c-2f3e-4628-9c90-30385f741550",
-   "metadata": {},
-   "source": [
-    "## This demo app shows:\n",
-    "* How to use LangChain's YoutubeLoader to retrieve the caption in a YouTube video\n",
-    "* How to ask Llama 3 to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method\n",
-    "* How to bypass the limit of Llama 3's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/tutorials/summarization/) for more info"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c866f6be",
-   "metadata": {},
-   "source": [
-    "We start by installing the necessary packages:\n",
-    "- [youtube-transcript-api](https://pypi.org/project/youtube-transcript-api/) API to get transcript/subtitles of a YouTube video\n",
-    "- [langchain](https://python.langchain.com/docs/get_started/introduction) provides necessary RAG tools for this demo\n",
-    "- [tiktoken](https://github.com/openai/tiktoken) BytePair Encoding tokenizer\n",
-    "- [pytube](https://pytube.io/en/latest/) Utility for downloading YouTube videos\n",
-    "\n",
-    "**Note** This example uses OctoAI to host the Llama 3 model. If you have not set up/or used OctoAI before, we suggest you take a look at the [hello_llama_cloud](hello_llama_cloud.ipynb) example for information on how to set up OctoAI before continuing with this example.\n",
-    "If you do not want to use OctoAI, you will need to make some changes to this notebook as you go along."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "02482167",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install langchain==0.1.19 youtube-transcript-api tiktoken pytube"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "af3069b1",
-   "metadata": {},
-   "source": [
-    "Let's first load a long (2:47:16) YouTube video (Lex Fridman with Yann Lecun: Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI) transcript using the YoutubeLoader."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3e4b8598",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import YoutubeLoader\n",
-    "\n",
-    "loader = YoutubeLoader.from_youtube_url(\n",
-    "    \"https://www.youtube.com/watch?v=5t1vTLU7s40\", add_video_info=True\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dca32ebb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load the youtube video caption into Documents\n",
-    "docs = loader.load()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "afba128f-b7fd-4b2f-873f-9b5163455d54",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check the docs length and content\n",
-    "len(docs[0].page_content), docs[0].page_content[:300]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4af7cc16",
-   "metadata": {},
-   "source": [
-    "You should see 142689 returned for the doc character length, which is about 30k words or 40k tokens, beyond the 8k context length limit of Llama 3. You'll see how to summarize a text longer than the limit.\n",
-    "\n",
-    "**Note**: We are using OctoAI in this example to host our Llama 3 model so you will need to get a OctoAI token.\n",
-    "\n",
-    "To get the OctoAI token:\n",
-    "\n",
-    "- You will need to first sign in with OctoAI with your github account\n",
-    "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n",
-    "\n",
-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ab3ac00e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# enter your OctoAI API token, or you can use local Llama. See README for more info\n",
-    "from getpass import getpass\n",
-    "import os\n",
-    "\n",
-    "OCTOAI_API_TOKEN = getpass()\n",
-    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6b911efd",
-   "metadata": {},
-   "source": [
-    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
-    "\n",
-    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* meta-llama-3-8b-instruct\n",
-    "* meta-llama-3-70b-instruct\n",
-    "* codellama-7b-instruct\n",
-    "* codellama-13b-instruct\n",
-    "* codellama-34b-instruct\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
-    "* llamaguard-7b"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "adf8cf3d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
-    "\n",
-    "llama3_8b = \"meta-llama-3-8b-instruct\"\n",
-    "llm = OctoAIEndpoint(\n",
-    "    model=llama3_8b,\n",
-    "    max_tokens=500,\n",
-    "    temperature=0.01\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8e3baa56",
-   "metadata": {},
-   "source": [
-    "Once everything is set up, we prompt Llama 3 to summarize the first 4000 characters of the transcript for us."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "51739e11",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.prompts import PromptTemplate\n",
-    "from langchain.chains import LLMChain\n",
-    "\n",
-    "prompt_template = \"Give me a summary of the text below: {text}?\"\n",
-    "prompt = PromptTemplate(\n",
-    "    input_variables=[\"text\"], template=prompt_template\n",
-    ")\n",
-    "chain = prompt | llm\n",
-    "\n",
-    "# be careful of the input text length sent to LLM\n",
-    "text = docs[0].page_content[:10000]\n",
-    "summary = chain.invoke(text)\n",
-    "\n",
-    "# Note: The context length of 8k tokens in Llama 3 is roughly 6000-7000 words or 32k characters\n",
-    "print(summary)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1ad1881a",
-   "metadata": {},
-   "source": [
-    "If you try the whole content which has over 142k characters, about 40k tokens, which exceeds the 8k limit, you'll get an empty result (OctoAI used to return an error \"BadRequestError: The token count (32704) of your prompt (32204) + your setting of `max_tokens` (500) cannot exceed this model's context length (8192).\")."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "61a088b7-cba2-4603-ba7c-f6673bfaa3cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# this will generate an empty result because the input exceeds Llama 3's context length limit\n",
-    "text = docs[0].page_content\n",
-    "summary = llm.invoke(f\"Give me a summary of the text below: {text}.\")\n",
-    "print(summary)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e112845f-de16-4c2f-8afe-6cca31f6fa38",
-   "metadata": {},
-   "source": [
-    "To fix this, you can use LangChain's load_summarize_chain method (detail [here](https://python.langchain.com/docs/tutorials/summarization/)).\n",
-    "\n",
-    "First you'll create splits or sub-documents of the original content, then use the LangChain's `load_summarize_chain` with the `refine` or `map_reduce type`.\n",
-    "\n",
-    "Because this may involve many calls to Llama 3, it'd be great to set up a quick free LangChain API key [here](https://smith.langchain.com/settings), run the following cell to set up necessary environment variables, and check the logs on [LangSmith](https://docs.smith.langchain.com/) during and after the run."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "55586a09-db53-4741-87d8-fdfb40d9f8cb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "os.environ[\"LANGCHAIN_API_KEY\"] = \"your_langchain_api_key\"\n",
-    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
-    "os.environ[\"LANGCHAIN_PROJECT\"] = \"Video Summary with Llama 3\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9bfee2d3-3afe-41d9-8968-6450cc23f493",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
-    "\n",
-    "# we need to split the long input text\n",
-    "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
-    "    chunk_size=1000, chunk_overlap=0\n",
-    ")\n",
-    "split_docs = text_splitter.split_documents(docs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "682799a8-3846-41b1-a908-02ab5ac3ecee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check the splitted docs lengths\n",
-    "len(split_docs), len(docs), len(split_docs[0].page_content), len(docs[0].page_content)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aecf6328",
-   "metadata": {},
-   "source": [
-    "The `refine` type implements the following steps under the hood:\n",
-    "\n",
-    "1. Call Llama 3 on the first sub-document to generate a concise summary;\n",
-    "2. Loop over each subsequent sub-document, pass the previous summary with the current sub-document to generate a refined new summary;\n",
-    "3. Return the final summary generated on the final sub-document as the final answer - the summary of the whole content.\n",
-    "\n",
-    "An example prompt template for each call in step 2, which gets used under the hood by LangChain, is:\n",
-    "\n",
-    "```\n",
-    "Your job is to produce a final summary.\n",
-    "We have provided an existing summary up to a certain point:\n",
-    "<previous_summary>\n",
-    "Refine the existing summary (only if needed) with some more content below:\n",
-    "<new_content>\n",
-    "```\n",
-    "\n",
-    "**Note**: The following call will make 33 calls to Llama 3 and generate the final summary in about 10 minutes."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3be1236a-fe6a-4bf6-983f-0e72dde39fee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chains.summarize import load_summarize_chain\n",
-    "\n",
-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
-    "print(chain.run(split_docs))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "752f2b71-5fd6-4a8a-ac09-371bce1db703",
-   "metadata": {},
-   "source": [
-    "You can also set `chain_type` to `map_reduce` to generate the summary of the entire content using the standard map and reduce method, which works behind the scene by first mapping each split document to a sub-summary via a call to LLM, then combines all those sub-summaries into a single final summary by yet another call to LLM.\n",
-    "\n",
-    "**Note**: The following call takes about 3 minutes and all the calls to Llama 3."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8991df49-8578-46de-8b30-cb2cd11e30f1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
-    "print(chain.run(split_docs))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
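
The split-then-summarize technique above outlives the OctoAI endpoint. A minimal sketch assuming any OpenAI-compatible Llama 3 host (placeholder `base_url`, `PROVIDER_API_KEY`, and model id) and the langchain 0.1.x APIs pinned in the deleted notebook:

```python
import os

from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

# Placeholder OpenAI-compatible Llama 3 endpoint.
llm = ChatOpenAI(
    base_url="https://your-provider.example/v1",
    api_key=os.environ["PROVIDER_API_KEY"],
    model="meta-llama-3-8b-instruct",
    temperature=0.01,
    max_tokens=500,
)

# Load the transcript and split it into chunks that fit the context window.
docs = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=5t1vTLU7s40"
).load()
split_docs = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
).split_documents(docs)

# "refine" folds each chunk into a running summary; "map_reduce" summarizes
# chunks independently, then merges the partial summaries in a final call.
chain = load_summarize_chain(llm, chain_type="map_reduce")
print(chain.run(split_docs))
```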

+ 2 - 15
end-to-end-use-cases/README.md

@@ -19,10 +19,6 @@ The scripts apply a throughput analysis and introduction to `lm-evaluation-harne
 ### Use Llama to automatically triage issues in an OSS repository and generate insights to improve community experience
 This tool utilizes an off-the-shelf Llama model to analyze, generate insights, and create a report for better understanding of the state of a repository. It serves as a reference implementation for using Llama to develop custom reporting and data analytics applications.
 
-## [VideoSummary](video_summary.ipynb): 
-
-### Ask Llama 3 to Summarize a Long YouTube Video (using Replicate or [OctoAI](../3p-integrations/octoai/video_summary.ipynb))
-This demo app uses Llama 3 to return a text summary of a YouTube video. It shows how to retrieve the caption of a YouTube video and how to ask Llama to summarize the content in different ways, from the simplest naive way that works for short text to more advanced methods of using LangChain's map_reduce and refine to overcome the 8K context length limit of Llama 3.
 
 ## [NBA2023-24](./coding/text2sql/quickstart.ipynb): 
 
@@ -34,10 +30,6 @@ This demo app shows how to use LangChain and Llama 3 to let users ask questions
 ### PDF to Podcast using Llama Models
 Workflow showcasing how to use multiple Llama models to go from any PDF to a Podcast and using open models to generate a multi-speaker podcast
 
-## [live_data](live_data.ipynb): 
-
-### Ask Llama 3 about Live Data (using Replicate or [OctoAI](../3p-integrations/octoai/live_data.ipynb))
-This demo app shows how to perform live data augmented generation tasks with Llama 3, [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps, and the [Tavily](https://tavily.com) live search API.
 
 ## [WhatsApp Chatbot](./customerservice_chatbots/whatsapp_chatbot/whatsapp_llama3.md): 
 ### Building a Llama 3 Enabled WhatsApp Chatbot
@@ -48,10 +40,5 @@ This step-by-step tutorial shows how to use the [WhatsApp Business API](https://
 ### Building a Llama 3 Enabled Messenger Chatbot
 This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot.
 
-### RAG Chatbot Example (running [locally](./customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../3p-integrations/octoai/RAG_chatbot_example/RAG_chatbot_example.ipynb))
-A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
-
-## [Sales Bot](./customerservice_chatbots/ai_agent_chatbot/SalesBot.ipynb): 
-
-### Sales Bot with Llama3 - A Summarization and RAG Use Case
-An summarization + RAG use case built around the Amazon product review Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services.
+### RAG Chatbot Example (running [locally](./customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb))
+A complete example of how to build a Llama 3 chatbot hosted in your browser that can answer questions based on your own data using retrieval augmented generation (RAG).

+ 0 - 668
end-to-end-use-cases/customerservice_chatbots/ai_agent_chatbot/SalesBot.ipynb

@@ -1,668 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "374b67d0-b446-4d6f-8e07-59e97716c55a",
-   "metadata": {},
-   "source": [
-    "# Sales Bot with Llama3 - A Summarization and RAG Use Case"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "add4953d-07c3-4480-ad91-7d0ea9c9fb55",
-   "metadata": {},
-   "source": [
-    "## Overview\n",
-    "\n",
-    "In this notebook you'll take an Amazon product reviews dataset from Kaggle and use Llama3 to obtain product review summaries, upsert those summaries in a vector database, then use Retrieval Augmented Generation (RAG) to power a sales chatbot that can make targeted product recommendations.\n",
-    "\n",
-    "Let's take a look at the overall workflow:\n",
-    "1. We start with a dataset that contains over 10,000 reviews across 900 Amazon musical instruments and accessories.\n",
-    "2. Using Llama2 70B chat (hosted on OctoAI), we generate summaries of product reviews for each product from the 20 most recent reviews. We format the summaries in JSON format.\n",
-    "3. We then take the summaries and upsert them into a vector database (Weaviate in this case)\n",
-    "4. We then use this vector database and Llama3 70B instruct (hosted on OctoAI) to build a RAG-based sales chatbot that provides targeted recommendations to the user based on the products that are present in the inventory.\n",
-    "\n",
-    "Note: at the time of writing this tutorial, JSON mode formatting isn't supported for Llama 3 on OctoAI via constrained sampling which is why we are falling back onto Llama 2. This tutorial will be updated when the feature becomes available to rely on Llama 3 exclusively.\n",
-    "\n",
-    "### OctoAI\n",
-    "We'll use [OctoAI](https://octo.ai/) to power all of the GenAI model needs of this notebook: LLMs, image gen, image animation.\n",
-    "* To use OctoAI, you'll need to go to https://octoai.cloud/ and sign in using your Google or GitHub account.\n",
-    "* Next you'll need to generate an OctoAI API token by following these [instructions](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token). Keep the API token in hand, we'll need it further down in this notebook.\n",
-    "\n",
-    "In this example we will use the Llama 3 70b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
-    "\n",
-    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* meta-llama-3-8b-instruct\n",
-    "* meta-llama-3-70b-instruct\n",
-    "* codellama-7b-instruct\n",
-    "* codellama-13b-instruct\n",
-    "* codellama-34b-instruct\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
-    "* llamaguard-7b\n",
-    "\n",
-    "### Weaviate\n",
-    "We'll use Weaviate Cloud Services (WCS) for our vector database. You can create an account and Weaviate clusters easily at the following link: https://console.weaviate.cloud/.\n",
-    "You can then create a cluster, from which you can obtain the REST Endpoint URL and the API key to use the cluster endpoint.\n",
-    "\n",
-    "### OpenAI\n",
-    "We'll be using OpenAI for its embedding model to upsert our vectors into the Weaviate vector database. Create an account and obtain an API key here: https://openai.com/api/\n",
-    "\n",
-    "### Local Python Notebook\n",
-    "We highly recommend launching this notebook from a fresh python environment, for instance you can run the following:\n",
-    "```\n",
-    "python3 -m venv .venv         \n",
-    "source .venv/bin/activate\n",
-    "```\n",
-    "All you need to run this notebook is to install jupyter notebook with `python3 -m pip install notebook` then run `jupyter notebook` ([link](https://jupyter.org/install)) in the same directory as this `.ipynb` file.\n",
-    "You don't need to install additional pip packages ahead of running the notebook, since those will be installed right at the beginning. You will need to ensure your system has `imagemagick` installed by following the [instructions](https://imagemagick.org/script/download.php)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "133c2ea4-0256-49cf-9f5a-a9e5bb0bb63f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Let's start by installing the appropriate python packages\n",
-    "! pip install octoai===1.0.2 openai weaviate-client pandas gradio pydantic"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "75341227-43f8-4a68-b3cb-31e8216f874e",
-   "metadata": {},
-   "source": [
-    "## Part 1: Review Summarization"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "793c06d7-fa67-4c67-a380-081ed3a7a7bf",
-   "metadata": {},
-   "source": [
-    "Let's start by importing all of the packages we need for this example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "edd366c8-4f0b-4211-83d3-c16e88cbd5c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import gradio\n",
-    "import json\n",
-    "import langchain\n",
-    "import os\n",
-    "import openai\n",
-    "import weaviate\n",
-    "from getpass import getpass\n",
-    "from json import loads\n",
-    "from pandas import DataFrame, concat, read_csv\n",
-    "from pydantic import BaseModel, Field\n",
-    "from typing import List\n",
-    "import weaviate.classes as wvc"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cd171a7c-c5e7-46d5-8a04-a0f7863609be",
-   "metadata": {},
-   "source": [
-    "Enter your OctoAI, Weaviate, and OpenAI tokens below"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3af09686-a654-45b0-98c5-dee6f30440c7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get OctoAI API token for Llama 2 & 3\n",
-    "OCTOAI_API_TOKEN = getpass()\n",
-    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "31c3e684-6e5e-41ad-81d4-970b06522553",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get WCS API key\n",
-    "WCS_API_KEY = getpass()\n",
-    "os.environ[\"WCS_API_KEY\"] = WCS_API_KEY"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a44f7b71-c4f9-4fd6-9a3b-1322c2fd0c35",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get WCS URL\n",
-    "WCS_URL = getpass()\n",
-    "os.environ[\"WCS_URL\"] = WCS_URL"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4502dfa-c369-4085-a697-fdcda00f970b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get OpenAI API key for the embedding model\n",
-    "OPENAI_API_KEY = getpass()\n",
-    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "883986ad-9f60-44d8-ab64-3f566261e055",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# First let's load the dataset from Kaggle: https://www.kaggle.com/datasets/eswarchandt/amazon-music-reviews\n",
-    "df = read_csv('Musical_instruments_reviews.csv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c05865a7-307a-425e-a6ee-f057d63db77b",
-   "metadata": {},
-   "source": [
-    "Set `product_record_limit` to a lower number if you just want to do a test run"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "22f024e7-3976-425f-b684-8b2c2c1ed191",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Set a product record limit\n",
-    "product_record_limit = 900"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "06554f51-5983-42fc-8a8e-684ae82099db",
-   "metadata": {
-    "scrolled": true
-   },
-   "source": [
-    "# List all of the unique ASIN:\n",
-    "asin_list = df.asin.unique()\n",
-    "print(\"There are {} unique products in the music product inventory\".format(len(asin_list)))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4941baa1-107b-4f39-8d04-1daa5acd465b",
-   "metadata": {},
-   "source": [
-    "For each one of the unique products, let's group the reviews together and sort them by how recent they are"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "38147b91-2425-46a7-b6c0-221173d81024",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get the reviews for the product ASIN, sorted by recency and store in dict\n",
-    "review_dict = {}\n",
-    "for asin in asin_list[0:product_record_limit]:\n",
-    "    reviews = df.loc[df['asin'] == asin]\\\n",
-    "                .sort_values([\"unixReviewTime\"], axis=0, ascending=False)\\\n",
-    "                .reviewText.tolist()\n",
-    "    review_dict[asin] = reviews"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7d5fb78d-808a-4753-abba-4a3066d76ba7",
-   "metadata": {},
-   "source": [
-    "To be able to store our summaries into our vector DB, we need to have the fields formatted into a JSON object. We use Pydantic base class model here to define our formatting."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b786cde1-116a-47eb-8478-3fa2285dcf9d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define the Pydantic model that specifies how our output should be formatted\n",
-    "class ProductRecord(BaseModel):\n",
-    "    \"\"\"The record of a given product\"\"\"\n",
-    "    description: str = Field(description=\"Description of the product\")\n",
-    "    name: str = Field(description=\"Name of the product\")\n",
-    "    review_summary: str = Field(description=\"Summary of all of the reviews\")\n",
-    "    ASIN: str = Field(description=\"ASIN of the product\")\n",
-    "    features: str = Field(description=\"Features of the product based on the reviews\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "08226a6e-f994-454b-9a1d-6246b34bfca2",
-   "metadata": {},
-   "source": [
-    "We define our prompt template below."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1cc3fe69-bf0c-4a50-8d9c-1ae6cb99a9ca",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Prepare a prompt template\n",
-    "template = '''\n",
-    "Here are product reviews for a music product with an ID of {asin}.\n",
-    " - Respond back only as only JSON!\n",
-    " - Provide:\n",
-    "     - the product \"description\",\n",
-    "     - the product \"name\",\n",
-    "     - a summary of all the reviews as \"review_summary\",\n",
-    "     - the \"ASIN\" and\n",
-    "     - and the product \"features\" based on the content of these reviews. \n",
-    " - The \"features\" should be a string describing the features and NOT JSON. \n",
-    " - Do not include the ASIN in the description field.\n",
-    " \n",
-    "The reviews for the product are: {reviews}\n",
-    "'''"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9b8dc3fa-4ad9-4329-96a0-353b05a1c43e",
-   "metadata": {},
-   "source": [
-    "We initialize the OctoAI client using OpenAI's API. All we have to do is override the `base_url` and `api_key`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "57c2ff0a-8029-41a6-a06f-41e560b92230",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Init OctoAI client\n",
-    "client = openai.OpenAI(\n",
-    "    base_url=\"https://text.octoai.run/v1\",\n",
-    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bd0eb425-ceea-4258-a52d-814b7335febb",
-   "metadata": {},
-   "source": [
-    "Iterate over all product ASINs and summarize the top 20 most recent reviews. Note: this takes a while to run unless we parallelize it."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1a55839e-a824-4919-b755-730eaac48d83",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Produce the 900 product summaries\n",
-    "review_summaries = []\n",
-    "counter = 0\n",
-    "\n",
-    "# This can take a while to process serially (30min+)\n",
-    "# TODO: Optimize to run in a few parallel threads to run faster while meeting the 240RPM limit\n",
-    "for asin, review_list in review_dict.items():\n",
-    "    print(f'Getting review summary {counter} of {len(review_dict)}, ASIN: {asin}')\n",
-    "    try:\n",
-    "        response = client.chat.completions.create(\n",
-    "            model=\"llama-2-70b-chat\",\n",
-    "            messages=[\n",
-    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-    "                {\"role\": \"user\", \"content\": template.format(\n",
-    "                    asin = asin,\n",
-    "                    reviews = review_list[0:20]\n",
-    "                )},\n",
-    "            ],\n",
-    "            temperature=0,\n",
-    "            response_format={\"type\": \"json_object\", \"schema\": ProductRecord.model_json_schema()},\n",
-    "            max_tokens=1024\n",
-    "        )\n",
-    "        print(\"\\n{}\\n\".format(response.choices[0].message.content))\n",
-    "        summary = loads(response.choices[0].message.content)\n",
-    "        summary[\"ASIN\"] = asin\n",
-    "        review_summaries.append(summary)\n",
-    "    except:\n",
-    "        print(f'Issue with ASIN {asin}, skipping')\n",
-    "        pass\n",
-    "    counter += 1\n",
-    "\n",
-    "review_summaries = DataFrame(review_summaries)\n",
-    "\n",
-    "print(review_summaries)"
-   ]
-  },
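The TODO in the cell above can be addressed with a small thread pool. A hedged sketch, reusing the same `client`, `template`, and `review_dict` from the cells above; the `max_workers` value is a guess and should be tuned against the 240 RPM limit:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from json import loads

def summarize(asin, review_list):
    # Same request as in the serial loop above
    response = client.chat.completions.create(
        model="llama-2-70b-chat",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": template.format(asin=asin, reviews=review_list[0:20])},
        ],
        temperature=0,
        response_format={"type": "json_object", "schema": ProductRecord.model_json_schema()},
        max_tokens=1024,
    )
    summary = loads(response.choices[0].message.content)
    summary["ASIN"] = asin
    return summary

parallel_summaries = []
# 4 workers is a conservative guess; raise it carefully while staying under 240 RPM
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(summarize, asin, reviews): asin
               for asin, reviews in review_dict.items()}
    for future in as_completed(futures):
        try:
            parallel_summaries.append(future.result())
        except Exception as e:
            print(f"Issue with ASIN {futures[future]}, skipping: {e}")
```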
-  {
-   "cell_type": "markdown",
-   "id": "4772d1c1-c9c4-466e-9c80-259804a4286b",
-   "metadata": {},
-   "source": [
-    "# Part 2: Retrieval Augmented Generation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ccd97408-d47f-46f3-b601-f66f8a3b20ff",
-   "metadata": {},
-   "source": [
-    "For our RAG use case we're going to rely on Weaviate vector database and on an OpenAI embedding model. \n",
-    "\n",
-    "When you define your collection, you'll need to provide properties, i.e. object attributes that you want to store in the collection. These properties map 1:1 to the JSON dictionary keys defined earlier for the `ProductRecord` Pydantic base model."
-   ]
-  },
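The connection code below expects three secrets in the environment. A minimal pre-flight check (the variable names are taken from the cell that follows):

```python
import os

# Fail fast if any required secret is missing before connecting
for var in ("WCS_URL", "WCS_API_KEY", "OPENAI_API_KEY"):
    assert os.getenv(var), f"Set the {var} environment variable first"
```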
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5dad98ec-531d-4fc2-aed9-9f337b957feb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Connect to WCS\n",
-    "wcs_client = weaviate.connect_to_wcs(\n",
-    "    cluster_url=os.getenv(\"WCS_URL\"),\n",
-    "    auth_credentials=weaviate.auth.AuthApiKey(os.getenv(\"WCS_API_KEY\")),\n",
-    "    headers={\n",
-    "        \"X-OpenAI-Api-Key\": os.environ[\"OPENAI_API_KEY\"]\n",
-    "    }\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "02953f7b-0149-4c13-a7cc-c4dd1da45d43",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create the collection if it doesn't already exist\n",
-    "try:\n",
-    "    collection = wcs_client.collections.get(\"Products\")\n",
-    "except:\n",
-    "    # Create the collection for products\n",
-    "    collection = wcs_client.collections.create(\n",
-    "        name=\"Products\",\n",
-    "        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),\n",
-    "        properties=[\n",
-    "            wvc.config.Property(\n",
-    "                name=\"ASIN\",\n",
-    "                data_type=wvc.config.DataType.TEXT\n",
-    "            ),\n",
-    "             wvc.config.Property(\n",
-    "                name=\"name\",\n",
-    "                data_type=wvc.config.DataType.TEXT\n",
-    "            ),\n",
-    "             wvc.config.Property(\n",
-    "                name=\"review_summary\",\n",
-    "                data_type=wvc.config.DataType.TEXT\n",
-    "            ),\n",
-    "             wvc.config.Property(\n",
-    "                name=\"features\",\n",
-    "                data_type=wvc.config.DataType.TEXT\n",
-    "            ),\n",
-    "             wvc.config.Property(\n",
-    "                name=\"description\",\n",
-    "                data_type=wvc.config.DataType.TEXT\n",
-    "            ),\n",
-    "        ]\n",
-    "    )\n",
-    "    print(\"Collection Created!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1551fd74-b143-4c02-9b56-364d33683fd3",
-   "metadata": {},
-   "source": [
-    "Now we upsert all of the vectors into the database using OpenAI's embedding model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "53f779e7-b875-4a19-9f9c-74b45992608e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert df to JSON string and then to a list of dictionaries\n",
-    "data = review_summaries.to_json(orient='records')\n",
-    "data_list = json.loads(data)\n",
-    "\n",
-    "items_to_insert = []\n",
-    "\n",
-    "for d in data_list:\n",
-    "    new_item = {\n",
-    "        \"ASIN\": d[\"ASIN\"],\n",
-    "        \"name\": d[\"name\"],\n",
-    "        \"description\": d[\"description\"],  \\\n",
-    "        \"features\": d[\"features\"],\n",
-    "        \"review_summary\": d[\"review_summary\"]\n",
-    "    }\n",
-    "    items_to_insert.append(new_item)\n",
-    "\n",
-    "    # Insert every 100 items\n",
-    "    if len(items_to_insert) == 100:\n",
-    "        collection.data.insert_many(items_to_insert)\n",
-    "        items_to_insert.clear()\n",
-    "\n",
-    "# Insert remaining items\n",
-    "if len(items_to_insert) > 0:\n",
-    "    collection.data.insert_many(items_to_insert)"
-   ]
-  },
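As an alternative to counting items manually, batching can be delegated to the client. A hedged sketch, assuming the weaviate v4 client's dynamic batching context manager:

```python
# Let the client manage batch sizing and flushing automatically
with collection.batch.dynamic() as batch:
    for d in data_list:
        batch.add_object(properties={
            "ASIN": d["ASIN"],
            "name": d["name"],
            "description": d["description"],
            "features": d["features"],
            "review_summary": d["review_summary"],
        })

# Surface any per-object failures once the batch has flushed
if collection.batch.failed_objects:
    print(f"{len(collection.batch.failed_objects)} objects failed to insert")
```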
-  {
-   "cell_type": "markdown",
-   "id": "35079318-41a5-46fc-8475-5d728550fb88",
-   "metadata": {},
-   "source": [
-    "Let's now try to run a hybrid search on the following query below.\n",
-    "Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.\n",
-    "It will return the 3 closest entries in the database according to the search criteria."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5f707954-c36b-4a83-874b-f817bd33c39a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Hybrid search\n",
-    "response = collection.query.hybrid(\n",
-    "    query=\"easy to learn instrument\",\n",
-    "    limit=3\n",
-    ")\n",
-    "for o in response.objects:\n",
-    "    print(o.properties)"
-   ]
-  },
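The fusion between the keyword and vector result sets can also be biased toward one side. A hedged variant, assuming the v4 client's `alpha` parameter (values near 0 favor BM25 keyword matches, values near 1 favor the vector search):

```python
# Lean toward keyword matches for short, literal queries
response = collection.query.hybrid(
    query="easy to learn instrument",
    alpha=0.25,
    limit=3,
)
for o in response.objects:
    print(o.properties)
```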
-  {
-   "cell_type": "markdown",
-   "id": "04d39507-5e8e-4374-a33c-53e57db6ef99",
-   "metadata": {},
-   "source": [
-    "Let's now define a helper function that gives us the relevant context given a string query. Let's see what it returns based on the question: \"What is a good beginner harmonica\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a1ca51c7-83e5-4896-acc9-753060592ba0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Helper function to run hybrid search on a user query and return the closest\n",
-    "# product review summaries relevant to the user query\n",
-    "def get_context(question, limit=3):\n",
-    "    response = collection.query.hybrid(\n",
-    "        query=question,\n",
-    "        limit=limit\n",
-    "    )\n",
-    "    return \"\\n\".join([str(o.properties) for o in response.objects])\n",
-    "\n",
-    "print(get_context(\"What is a good beginner harmonica\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "677f534c-8be4-4b6b-82d9-2df8e2ad12d4",
-   "metadata": {},
-   "source": [
-    "Great, we're now ready to build a sales assistant helper function.\n",
-    "\n",
-    "We first define a prompt template for Llama 3 - based on the context provided by the vector hybrid search (i.e. collection of product summaries of relevance to the question), provide a helpful recommendation to the customer. \n",
-    "\n",
-    "Also provide links to the product that the user can click on to view the product on Amazon's website. For that we use the fact that any product referenced by its aSIN can be accessed at the following url: `https://www.amazon.com/exec/obidos/ASIN/<insert aSIN here>`"
-   ]
-  },
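The ASIN-to-URL convention above is easy to factor out into a helper; a purely illustrative sketch:

```python
def amazon_url(asin: str) -> str:
    """Build the Amazon product URL for a given ASIN."""
    return f"https://www.amazon.com/exec/obidos/ASIN/{asin}"

# amazon_url("B001EL6I8W")
# -> "https://www.amazon.com/exec/obidos/ASIN/B001EL6I8W"
```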
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "856d021a-add5-48f4-a09c-258d2a617095",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sales_template = \"\"\"\n",
-    "You are a sales assistant. Answer the user questions as helpfully as possible.\n",
-    "Only recommend the products that are provided in the context provided below.\n",
-    "\n",
-    "Provide a reference to each product you mention with hyperlinks:\n",
-    "* Provide the name of the product\n",
-    "* Embed the hyperlink in the name of the product as follows\n",
-    "    * If the product name is \"Solid Electric Guitar Case with Accessories Compartment\"\n",
-    "    * And the aSIN is \"B001EL6I8W\"\n",
-    "    * Format the reference as follows: \n",
-    "         [Solid Electric Guitar Case with Accessories Compartment](https://www.amazon.com/exec/obidos/ASIN/B001EL6I8W)\n",
-    "\n",
-    "Finish with a references section.\n",
-    "\n",
-    "Customer question: {}\n",
-    "\n",
-    "Product context: {}\n",
-    "\n",
-    "AI:\n",
-    "\"\"\"\n",
-    "\n",
-    "def sales_assistant(question):  \n",
-    "    response = client.chat.completions.create(\n",
-    "                model=\"meta-llama-3-70b-instruct\",\n",
-    "                messages=[\n",
-    "                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-    "                    {\"role\": \"user\", \"content\": sales_template.format(question, get_context(question, limit=10))},\n",
-    "                ],\n",
-    "                temperature=0,\n",
-    "                max_tokens=1024\n",
-    "            )\n",
-    "    \n",
-    "    return response.choices[0].message.content\n",
-    "\n",
-    "print(sales_assistant(\"what is must have accessory for my new electric guitar\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "faccba14-9216-4420-b6c5-ddf4029d7904",
-   "metadata": {},
-   "source": [
-    "# Part 3: Gradio-based sales assistant demo"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3e2b73b5-6bdf-4c87-b044-2690fd52605f",
-   "metadata": {},
-   "source": [
-    "In this section we build a simple an interactive sales bot assistant using Gradio."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "53805acb-3e8d-40fa-8045-c589cb14eadd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import gradio as gr\n",
-    "\n",
-    "def predict(message, history):\n",
-    "    history_openai_format = []\n",
-    "    for human, assistant in history:\n",
-    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
-    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
-    "    history_openai_format.append({\"role\": \"user\", \"content\": sales_template.format(message, get_context(message, limit=5))})\n",
-    "\n",
-    "    response = client.chat.completions.create(\n",
-    "        model = 'meta-llama-3-70b-instruct',\n",
-    "        messages = history_openai_format,\n",
-    "        temperature = 0.0,\n",
-    "        stream = True\n",
-    "     )\n",
-    "\n",
-    "    partial_message = \"\"\n",
-    "    for chunk in response:\n",
-    "        if chunk.choices[0].delta.content is not None:\n",
-    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
-    "              yield partial_message\n",
-    "\n",
-    "gr.ChatInterface(predict).launch()"
-   ]
-  },
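To share the demo beyond localhost, Gradio can generate a temporary public link; a one-line variant of the launch call (`share` and `server_port` are standard `launch()` keyword arguments):

```python
# share=True requests a temporary public URL; server_port pins the local port
gr.ChatInterface(predict).launch(share=True, server_port=7860)
```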
-  {
-   "cell_type": "markdown",
-   "id": "6d4e65fe-0246-40b7-adb6-9091cccbc486",
-   "metadata": {},
-   "source": [
-    "**Authors**\n",
-    "- Thierry Moreau, OctoAI - tmoreau@octo.ai\n",
-    "- Jonathan Tuite, Weaviate - jon@weaviate.io"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

Changes are not shown because the file is too large.
+ 0 - 10262
end-to-end-use-cases/customerservice_chatbots/ai_agent_chatbot/musical_instruments_reviews.csv


Changes are not shown because the file is too large.
+ 0 - 218
end-to-end-use-cases/live_data.ipynb


+ 0 - 1
getting-started/inference/README.md

@@ -3,4 +3,3 @@
 This folder contains scripts to get you started with inference on Meta Llama models.
 
 * [Local Inference](./local_inference/) contains scripts to do memory efficient inference on servers and local machines
-* [Mobile Inference](./mobile_inference/) has scripts using MLC to serve Llama on Android (h/t to OctoAI for the contribution!)

+ 0 - 147
getting-started/inference/mobile_inference/android_inference/README.md

@@ -1,147 +0,0 @@
-# Running Llama3 8B Instruct on Android with MLC-LLM
-
-Author: Thierry Moreau - tmoreau@octo.ai
-
-# Overview
-In this tutorial we'll learn how to deploy Llama3 8B Instruct on an Android-based phone using MLC-LLM.
-
-Machine Learning Compilation for Large Language Models (MLC LLM) is a high-performance, universal deployment solution that allows native deployment of any large language model, with native APIs and compiler acceleration. The mission of the project is to enable everyone to develop, optimize, and deploy AI models natively on their own devices using ML compilation techniques.
-
-You can read more about MLC-LLM at the following [link](https://github.com/mlc-ai/mlc-llm).
-
-MLC-LLM is also what powers the Llama3 inference APIs provided by [OctoAI](https://octo.ai/). You can use OctoAI for your Llama3 cloud-based inference needs by trying out the examples under the [following path](../../../../3p-integrations/octoai/).
-
-This tutorial was tested with the following setup:
-* MacBook Pro 16 inch from 2021 with Apple M1 Max and 32GB of RAM running Sonoma 14.3.1
-* OnePlus 12 Android Smartphone with a Snapdragon 8Gen3 SoC and 12GB of RAM, running OxygenOS 14.0
-
-Running Llama3 on a phone will likely require a powerful chipset. We haven't extensively tested the range of chipsets that support this use case. Feel free to update this README.md to specify which devices were successfully tested.
-
-| Phone      | Chipset          | RAM  | Status  | Comments |
-|------------|------------------|------|---------|----------|
-| OnePlus 12 | Snapdragon 8Gen3 | 12GB | Success | None     |
-|            |                  |      |         |          |
-
-This guide is heavily based on the [MLC Android Guide](https://llm.mlc.ai/docs/deploy/android.html), but several steps have been taken to streamline the instructions.
-
-# Pre-requisites
-
-## Python
-
-Whether you're using conda or a virtual environment to manage your packages, we highly recommend starting from scratch with a clean, new environment.
-
-For instance with virtual environment:
-```bash
-python3 -m venv .venv
-source .venv/bin/activate
-```
-
-Next you'll need to install the following packages:
-```bash
-python3 -m pip install -r requirements.txt
-```
-
-## Rust
-
-[Rust](https://www.rust-lang.org/tools/install) is needed to cross-compile HuggingFace tokenizers to Android.
-Make sure rustc, cargo, and rustup are available in $PATH.
-
-
-## Android Studio
-
-Install Android Studio from <!-- markdown-link-check-disable -->https://developer.android.com/studio<!-- markdown-link-check-enable --> with NDK and CMake.
-
-To install NDK and CMake, in the Android Studio welcome page, click “Projects → SDK Manager → SDK Tools”. Set up the following environment variables:
-
-* ANDROID_NDK so that $ANDROID_NDK/build/cmake/android.toolchain.cmake is available.
-* TVM_NDK_CC that points to NDK's clang compiler.
-
-For instance, the paths will look like the following on OSX for user `moreau`:
-```bash
-# Android + TVM setup
-export ANDROID_NDK="/Users/moreau/Library/Android/sdk/ndk/26.1.10909125"
-export TVM_NDK_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android24-clang"
-```
-
-This tutorial was tested successfully on Android Studio Hedgehog | 2023.1.1 Patch 1.
-
-## JDK
-
-A JDK, such as OpenJDK >= 17, is required to compile the Java bindings of the TVM Unity runtime.
-
-We strongly recommend setting the JAVA_HOME to the JDK bundled with Android Studio. Using Android Studio’s JBR bundle as recommended (<!-- markdown-link-check-disable -->https://developer.android.com/build/jdks<!-- markdown-link-check-enable -->) will reduce the chances of potential errors in JNI compilation.
-
-For instance on macOS, you'll need to point JAVA_HOME to the following.
-
-```bash
-export JAVA_HOME=/Applications/Android\ Studio.app/Contents/jbr/Contents/Home
-```
-
-To make sure the java binary can be found, run `ls $JAVA_HOME/bin/java`.
-
-## MLC-LLM
-
-Let's clone mlc-llm from its repo in the directory of your choice:
-
-```bash
-cd /path/to/where/to/clone/repo
-git clone https://github.com/mlc-ai/mlc-llm --recursive
-export MLC_LLM_HOME=/path/to/mlc-llm
-```
-
-At the time of writing this README, we tested `mlc-llm` at the following sha: `21feb7010db02e0c2149489f5972d6a8a796b5a0`.
-
-## Phone Setup
-
-Enable debugging in your phone's developer settings. Each phone manufacturer has its own approach to enabling debug mode, so a simple Google search should equip you with the steps for your device.
-
-In addition, make sure to change your USB configuration from "Charging" to "MTP (Media Transfer Protocol)". This will allow the development machine to connect to the device.
-
-Connect your phone to your development machine. On OSX, you'll be prompted on the dev machine whether you want to allow the accessory to connect. Hit "Allow".
-
-# Build Steps
-
-## Building the Android Package with MLC
-
-First replace the file under `android/MLCChat/mlc-package-config.json` with the [mlc-package-config.json](./mlc-package-config.json) from llama-cookbook.
-
-To understand what these JSON fields mean you can refer to this [documentation](https://llm.mlc.ai/docs/deploy/android.html#step-2-build-runtime-and-model-libraries).
-
-
-From the `mlc-llm` project root directory:
-
-```bash
-cd $MLC_LLM_HOME
-cd android/MLCChat
-python3 -m mlc_llm package  --package-config mlc-package-config.json --output dist
-```
-
-The command above will take a few minutes to run as it runs through the following steps:
-
-* Compile the Llama 3 8B Instruct model specified in `mlc-package-config.json` into a binary model library.
-* Build the `mlc-llm` runtime and tokenizer. In addition to the model itself, a lightweight runtime and tokenizer are required to actually run the LLM.
-
-## Building and Running MLC Chat in Android Studio
-
-Now let's launch Android Studio.
-
-* On the "Welcome to Android Studio" page, hit "Open", and navigate to `$MLC_LLM_HOME/android/MLCChat`, then hit "Open"
-* A window will pop up asking whether to "Trust and Open project 'MLCChat'" - hit "Trust Project"
-* The project will now launch
-* Under File → Project Structure... → Project, change the Gradle Version (second dropdown from the top) to 8.5
-
-Connect your phone to your development machine - assuming you've followed the setup steps in the pre-requisite section, you should be able to see the device.
-
-Next you'll need to:
-
-* Hit Build -> Make Project.
-* Hit Run -> Run 'app'
-
-The MLCChat app will launch on your phone. Now, on your phone:
-
-* Under Model List you'll see the `Llama-3-8B-Instruct` LLM listed.
-* The model isn't quite ready to launch yet, because the weights need to be downloaded over Wi-Fi first. Hit the Download button to the right of the model name to download the weights from HuggingFace.
-
-Note that you can change the build settings to bundle the weights with the MLCChat app so you don't have to download them over Wi-Fi. To do so, follow the instructions [here](https://llm.mlc.ai/docs/deploy/android.html#bundle-model-weights).
-
-Once the model weights are downloaded you can now interact with Llama 3 locally on your Android phone!

+ 0 - 14
getting-started/inference/mobile_inference/android_inference/mlc-package-config.json

@@ -1,14 +0,0 @@
-{
-    "device": "android",
-    "model_list": [
-        {
-            "model": "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
-            "estimated_vram_bytes": 4348727787,
-            "model_id": "Llama-3-8B-Instruct",
-            "overrides": {
-                "context_window_size": 768,
-                "prefill_chunk_size": 256
-            }
-        }
-    ]
-}

+ 0 - 14
getting-started/inference/mobile_inference/android_inference/requirements.txt

@@ -1,14 +0,0 @@
---pre
---find-links https://mlc.ai/wheels
-mlc-llm-nightly
-mlc-ai-nightly
-attrs
-decorator
-numpy
-psutil
-pydantic
-requests
-scipy
-setuptools
-torch
-tqdm

+ 0 - 34
src/llama_cookbook/inference/llm.py

@@ -157,37 +157,3 @@ class ANYSCALE(LLM):
             "mistralai/Mistral-7B-Instruct-v0.1",
             "HuggingFaceH4/zephyr-7b-beta",
         ]
-
-class OctoAI(LLM):
-    """Accessing OctoAI"""
-
-    def __init__(self, model: str, api_key: str) -> None:
-        super().__init__(model, api_key)
-        self.client = openai.OpenAI(base_url="https://text.octoai.run/v1", api_key=api_key)  # noqa
-
-    @override
-    def query(self, prompt: str) -> str:
-        # Best-effort attempt to suppress openai log spew.
-        # Likely won't work well in a multi-threaded environment.
-        level = logging.getLogger().level
-        logging.getLogger().setLevel(logging.WARNING)
-        response = self.client.chat.completions.create(
-            model=self.model,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant. Keep your responses limited to one short paragraph if possible."},
-                {"role": "user", "content": prompt},
-            ],
-            max_tokens=MAX_TOKENS,
-            temperature=TEMPERATURE,
-            top_p=TOP_P,
-        )
-        logging.getLogger().setLevel(level)
-        return response.choices[0].message.content
-
-    @override
-    def valid_models(self) -> list[str]:
-        return [
-            "llamaguard-2-8b",
-            "meta-llama-3-8b-instruct",
-            "meta-llama-3-70b-instruct",        
-        ]