
update readme and delete stale content (#510)

Jeff Tang committed 11 months ago
commit fb7dd3a327

File diff suppressed because it is too large
+ 7 - 17
recipes/README.md


+ 0 - 130
recipes/inference/llama_web_ui/Llama2_Gradio.ipynb

@@ -1,130 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4532411",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO REFACTOR: Integrate code from _legacy/inference.py into this notebook"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "47a9adb3",
-   "metadata": {},
-   "source": [
-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
-    "\n",
-    "Since we are using Replicate in this example, you will need to replace `<your replicate api token>` with your API token.\n",
-    "\n",
-    "To get the Replicate token: \n",
-    "\n",
-    "- You will need to first sign in with Replicate with your github account\n",
-    "- Then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while \n",
-    "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on Replicate.\n",
-    "\n",
-    "To run this example:\n",
-    "- Set up your Replicate API token and enter it in place of `<your replicate api token>`\n",
-    "- Run the notebook\n",
-    "- Enter your question and click Submit\n",
-    "\n",
-    "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "928041cc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running on local URL:  http://127.0.0.1:7860\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": []
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain.schema import AIMessage, HumanMessage\n",
-    "import gradio as gr\n",
-    "from langchain.llms import Replicate\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"REPLICATE_API_TOKEN\"] = \"<your replicate api token>\"\n",
-    "\n",
-    "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
-    "\n",
-    "llm = Replicate(\n",
-    "    model=llama2_13b_chat,\n",
-    "    model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def predict(message, history):\n",
-    "    history_langchain_format = []\n",
-    "    for human, ai in history:\n",
-    "        history_langchain_format.append(HumanMessage(content=human))\n",
-    "        history_langchain_format.append(AIMessage(content=ai))\n",
-    "    history_langchain_format.append(HumanMessage(content=message))\n",
-    "    gpt_response = llm(message) #history_langchain_format)\n",
-    "    return gpt_response#.content\n",
-    "\n",
-    "gr.ChatInterface(predict).launch()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.18"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
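
For readers who still want the Gradio demo this commit removes, here is a minimal standalone sketch of the same flow, assuming `gradio`, `langchain`, and `replicate` are installed and a Replicate API token is in hand. The model ID and generation parameters come from the deleted notebook; folding the chat history into a flat prompt is an assumption (the original built a LangChain message list but then passed only the latest message to the model, leaving the history unused).

```python
# Minimal sketch of the removed Gradio + Replicate chat demo.
import os

import gradio as gr
from langchain.llms import Replicate

os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"

# Model ID and generation parameters taken from the deleted notebook.
llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
llm = Replicate(
    model=llama2_13b_chat,
    model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens": 500},
)

def predict(message, history):
    # Fold prior turns into a flat prompt; the deleted notebook never
    # actually sent the history to the model.
    turns = [f"User: {user}\nAssistant: {ai}" for user, ai in history]
    turns.append(f"User: {message}\nAssistant:")
    return llm("\n".join(turns))

gr.ChatInterface(predict).launch()  # serves on http://127.0.0.1:7860
```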

+ 0 - 25
recipes/inference/llama_web_ui/README.md

@@ -1,25 +0,0 @@
-## Quick Web UI for Llama2 Chat
-If you prefer to see Llama2 in action in a web UI instead of the notebooks above, you can try one of these two methods:
-
-### Running [Streamlit](https://streamlit.io/) with Llama2
-Open a Terminal, run the following commands:
-```
-pip install streamlit langchain replicate
-git clone https://github.com/facebookresearch/llama-recipes
-cd llama-recipes/llama-demo-apps
-```
-
-Replace the `<your replicate api token>` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
-
-Then run `streamlit run streamlit_llama2.py` and you'll see the following question-and-answer UI in your browser - enter a new question, click Submit, and see Llama2's answer:
-
-![](../../../docs/images/llama2-streamlit.png)
-![](../../../docs/images/llama2-streamlit2.png)
-
-### Running [Gradio](https://www.gradio.app/) with Llama2 (using [Replicate](Llama2_Gradio.ipynb) or [OctoAI](../../llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb))
-
-To see how to query Llama2 and get answers with the Gradio UI, both from the notebook and the web, just launch the notebook `Llama2_Gradio.ipynb`. For more info on how to get set up with a token to power these apps, see the notes on [Replicate](../../README.md#replicate_note) and [OctoAI](../../README.md#octoai_note).
-
-Then enter your question and click Submit. In the notebook, or in a browser at http://127.0.0.1:7860, you'll see the following UI:
-
-![](../../../docs/images/llama2-gradio.png)

+ 0 - 3
recipes/inference/llama_web_ui/requirements.txt

@@ -1,3 +0,0 @@
-streamlit
-langchain
-replicate

+ 0 - 27
recipes/inference/llama_web_ui/streamlit_llama2.py

@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-# TODO REFACTOR: Convert this to an ipynb notebook
-
-import streamlit as st
-from langchain.llms import Replicate
-import os
-
-st.title("Llama2-powered Streamlit App")
-
-with st.sidebar:
-    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
-
-def generate_response(input_text):
-    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
-
-    llm = Replicate(
-        model=llama2_13b_chat,
-        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
-    )
-    st.info(llm(input_text))
-
-with st.form("my_form"):
-    text = st.text_area("Enter text:", "What is Generative AI?")
-    submitted = st.form_submit_button("Submit")
-    generate_response(text)
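
Since `streamlit_llama2.py` is also being deleted, here is a sketch of the same app with one bug fixed, under the same assumptions as above (`streamlit`, `langchain`, and `replicate` installed, Replicate token available): the original called `generate_response(text)` on every rerun, even before the user clicked Submit.

```python
# Sketch of the removed Streamlit app; the model ID and parameters are
# from the deleted file, and the `if submitted:` guard is the fix.
import os

import streamlit as st
from langchain.llms import Replicate

st.title("Llama2-powered Streamlit App")

with st.sidebar:
    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"

def generate_response(input_text):
    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
    llm = Replicate(
        model=llama2_13b_chat,
        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens": 500},
    )
    st.info(llm(input_text))

with st.form("my_form"):
    text = st.text_area("Enter text:", "What is Generative AI?")
    submitted = st.form_submit_button("Submit")
    if submitted:  # only query the model after the user clicks Submit
        generate_response(text)
```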

+ 2 - 4
recipes/inference/model_servers/README.md

@@ -1,4 +1,2 @@
-## [Running Llama2 On-Prem with vLLM and TGI](llama-on-prem.md)
-This tutorial shows how to use Llama 2 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 2 on-prem apps.
-
-\* To run a quantized Llama2 model on iOS and Android, you can use  the open source [MLC LLM](https://github.com/mlc-ai/mlc-llm) or [llama.cpp](https://github.com/ggerganov/llama.cpp). You can even make a Linux OS that boots to Llama2 ([repo](https://github.com/trholding/llama2.c)).
+## [Running Llama 3 On-Prem with vLLM and TGI](llama-on-prem.md)
+This tutorial shows how to use Llama 3 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 3 on-prem apps.
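
As a taste of what the linked tutorial covers: once vLLM is serving a model through its OpenAI-compatible endpoint, you can query it with plain HTTP. This is a hedged sketch; the server address, port, and model name below are illustrative assumptions, not values from this commit.

```python
# Query a vLLM OpenAI-compatible server; assumes it was started with
# something like:
#   python -m vllm.entrypoints.openai.api_server \
#       --model meta-llama/Meta-Llama-3-8B-Instruct
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # vLLM's default port
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model name
        "messages": [{"role": "user", "content": "What is Generative AI?"}],
        "max_tokens": 200,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```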