
Merge branch 'main' into tmoreau89/salesbot

Thierry Moreau, 11 months ago
parent
commit
b9bd30c4d0

+ 7 - 7
docs/multi_gpu.md

@@ -24,7 +24,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -43,7 +43,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 Setting `use_fast_kernels` will enable the use of Flash Attention or Xformers memory-efficient kernels, depending on the hardware being used. This can speed up the fine-tuning job. It has been enabled in HuggingFace's `optimum` library as a one-liner API; read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
 ```
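For context, the `optimum` one-liner that `use_fast_kernels` builds on looks roughly like the sketch below. This is an illustrative sketch rather than the recipe's actual code, and the checkpoint path is a placeholder.

```python
from optimum.bettertransformer import BetterTransformer
from transformers import AutoModelForCausalLM

# Placeholder checkpoint path; substitute your own model folder.
model = AutoModelForCausalLM.from_pretrained("/path_of_model_folder/8B")

# One-liner from HuggingFace optimum: swaps supported modules for
# Flash Attention / memory-efficient attention kernels where available.
model = BetterTransformer.transform(model)
```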
 
 ### Fine-tuning using FSDP Only
@@ -52,7 +52,7 @@ If interested in running full parameter finetuning without making use of PEFT me
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 
 ```
 
@@ -62,7 +62,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 
 ```
 
@@ -95,16 +95,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammar_dataset
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 4 - 4
docs/single_gpu.md

@@ -20,7 +20,7 @@ Get access to a machine with one GPU or if using a multi-GPU machine please make
 
 ```bash
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
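Roughly, `--quantization` amounts to loading the base model in 8-bit via `bitsandbytes`; below is a minimal sketch of that idea (placeholder path, and the recipe's actual wiring may differ).

```python
import torch
from transformers import AutoModelForCausalLM

# 8-bit weights roughly halve memory versus fp16, which is what makes
# fine-tuning an 8B model on a single GPU feasible.
model = AutoModelForCausalLM.from_pretrained(
    "/path_of_model_folder/8B",   # placeholder path
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
```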
 The args used in the command above are:
@@ -51,16 +51,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammar_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

File diff suppressed because it is too large
+ 7 - 17
recipes/README.md


+ 1 - 1
recipes/finetuning/README.md

@@ -99,7 +99,7 @@ It lets us specify the training settings for everything from `model_name` to `da
 You can enable [W&B](https://wandb.ai/) experiment tracking by using the `use_wandb` flag as shown below. You can change the project name, entity, and other `wandb.init` arguments in `wandb_config`.
 
 ```bash
-python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
+python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
 ```
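Under the hood this boils down to a `wandb.init` call populated from `wandb_config`; here is a minimal sketch, with made-up project and entity names:

```python
import wandb

# Placeholder project/entity; in the recipe these come from wandb_config.
run = wandb.init(project="my-llama-finetune", entity="my-team")
run.log({"train/loss": 0.42})  # logged metrics then appear on your dashboard
run.finish()
```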
 You'll be able to access a dedicated project or run link on [wandb.ai](https://wandb.ai) and see your dashboard like the one below.
 <div style="display: flex;">

+ 7 - 7
recipes/finetuning/multigpu_finetuning.md

@@ -23,7 +23,7 @@ Get access to a machine with multiple GPUs (in this case we tested with 4 A100 a
 <details open>
 <summary>Single-node Multi-GPU</summary>
 
-    torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+    torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 </details>
 
@@ -49,7 +49,7 @@ The args used in the command above are:
 If you are interested in running full parameter fine-tuning without making use of PEFT methods, use the following command. Make sure to change `nproc_per_node` to match your available GPUs. This has been tested with `BF16` on 8x A100 40GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 ```
 
 ### Using less CPU memory (FSDP on 70B model)
@@ -57,7 +57,7 @@ torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name
 If you are running full parameter fine-tuning on the 70B model, you can enable `low_cpu_fsdp` mode with the following command. This option loads the model on rank 0 only before moving it to devices to construct FSDP, which can dramatically reduce CPU memory when loading large models like the 70B (on an 8-GPU node, this reduces CPU memory from 2+ TB to 280 GB for the 70B model). This has been tested with `BF16` on 16x A100 80GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 ```
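A minimal sketch of the rank-0-only loading pattern described above, assuming a standard HuggingFace model class (the actual implementation lives in `finetuning.py` and may differ in detail):

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from transformers import AutoConfig, AutoModelForCausalLM

dist.init_process_group("nccl")
rank = dist.get_rank()
model_name = "/path_of_model_folder/70B"  # placeholder path

if rank == 0:
    # Only rank 0 materializes the full checkpoint in CPU RAM.
    model = AutoModelForCausalLM.from_pretrained(model_name)
else:
    # Other ranks build an empty shell on the meta device (no weight memory).
    with torch.device("meta"):
        model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(model_name))

# sync_module_states=True broadcasts rank 0's weights during FSDP init, so peak
# CPU usage stays near one model copy instead of one copy per rank.
model = FSDP(
    model,
    device_id=torch.cuda.current_device(),
    sync_module_states=True,
    param_init_fn=(
        (lambda m: m.to_empty(device=torch.device("cuda"), recurse=False))
        if rank != 0
        else None
    ),
)
```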
 
 
@@ -79,16 +79,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammar_dataset
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -103,7 +103,7 @@ This will require to set the Sharding strategy in [fsdp config](../../src/llama_
 
 ```bash
 
-torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --hsdp --sharding_group_size n --replica_group_size world_size/n
+torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --hsdp --sharding_group_size n --replica_group_size world_size/n
 
 ```
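To make the group-size arithmetic concrete: the two flags must satisfy `sharding_group_size * replica_group_size = world_size`. In the `--nnodes 4 --nproc_per_node 8` invocation above (world size 32), a typical choice would be `--sharding_group_size 8 --replica_group_size 4`, sharding the model across the 8 GPUs within each node and replicating it across the 4 nodes, since 8 * 4 = 32.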
 

+ 4 - 4
recipes/finetuning/singlegpu_finetuning.md

@@ -16,7 +16,7 @@ To run fine-tuning on a single GPU, we will make use of two packages:
 ## How to run it?
 
 ```bash
-python -m finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 ```
 The args used in the command above are:
 
@@ -48,16 +48,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammar_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 0 - 130
recipes/inference/llama_web_ui/Llama2_Gradio.ipynb

@@ -1,130 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4532411",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO REFACTOR: Integrate code from _legacy/inference.py into this notebook"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "47a9adb3",
-   "metadata": {},
-   "source": [
-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
-    "\n",
-    "Since we are using Replicate in this example, you will need to replace `<your replicate api token>` with your API token.\n",
-    "\n",
-    "To get the Replicate token: \n",
-    "\n",
-    "- You will need to first sign in with Replicate with your github account\n",
-    "- Then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while \n",
-    "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on Replicate.\n",
-    "\n",
-    "To run this example:\n",
-    "- Set up your Replicate API token and enter it in place of `<your replicate api token>`\n",
-    "- Run the notebook\n",
-    "- Enter your question and click Submit\n",
-    "\n",
-    "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "928041cc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running on local URL:  http://127.0.0.1:7860\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": []
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain.schema import AIMessage, HumanMessage\n",
-    "import gradio as gr\n",
-    "from langchain.llms import Replicate\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"REPLICATE_API_TOKEN\"] = \"<your replicate api token>\"\n",
-    "\n",
-    "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
-    "\n",
-    "llm = Replicate(\n",
-    "    model=llama2_13b_chat,\n",
-    "    model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def predict(message, history):\n",
-    "    history_langchain_format = []\n",
-    "    for human, ai in history:\n",
-    "        history_langchain_format.append(HumanMessage(content=human))\n",
-    "        history_langchain_format.append(AIMessage(content=ai))\n",
-    "    history_langchain_format.append(HumanMessage(content=message))\n",
-    "    gpt_response = llm(message) #history_langchain_format)\n",
-    "    return gpt_response#.content\n",
-    "\n",
-    "gr.ChatInterface(predict).launch()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.18"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

+ 0 - 25
recipes/inference/llama_web_ui/README.md

@@ -1,25 +0,0 @@
-## Quick Web UI for Llama2 Chat
-If you prefer to see Llama2 in action in a web UI, instead of the notebooks above, you can try one of the two methods:
-
-### Running [Streamlit](https://streamlit.io/) with Llama2
-Open a Terminal, run the following commands:
-```
-pip install streamlit langchain replicate
-git clone https://github.com/facebookresearch/llama-recipes
-cd llama-recipes/llama-demo-apps
-```
-
-Replace the `<your replicate api token>` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
-
-Then run the command `streamlit run streamlit_llama2.py` and you'll see on your browser the following UI with question and answer - you can enter new text question, click Submit, and see Llama2's answer:
-
-![](../../../docs/images/llama2-streamlit.png)
-![](../../../docs/images/llama2-streamlit2.png)
-
-### Running [Gradio](https://www.gradio.app/) with Llama2 (using [Replicate](Llama2_Gradio.ipynb) or [OctoAI](../../llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb))
-
-To see how to query Llama2 and get answers with the Gradio UI both from the notebook and web, just launch the notebook `Llama2_Gradio.ipynb`. For more info, on how to get set up with a token to power these apps, see the note on [Replicate](../../README.md#replicate_note) and [OctoAI](../../README.md##octoai_note).
-
-Then enter your question, click Submit. You'll see in the notebook or a browser with URL http://127.0.0.1:7860 the following UI:
-
-![](../../../docs/images/llama2-gradio.png)

+ 0 - 3
recipes/inference/llama_web_ui/requirements.txt

@@ -1,3 +0,0 @@
-streamlit
-langchain
-replicate

+ 0 - 27
recipes/inference/llama_web_ui/streamlit_llama2.py

@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-# TODO REFACTOR: Convert this to an ipynb notebook
-
-import streamlit as st
-from langchain.llms import Replicate
-import os
-
-st.title("Llama2-powered Streamlit App")
-
-with st.sidebar:
-    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
-
-def generate_response(input_text):
-    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
-
-    llm = Replicate(
-        model=llama2_13b_chat,
-        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
-    )
-    st.info(llm(input_text))
-
-with st.form("my_form"):
-    text = st.text_area("Enter text:", "What is Generative AI?")
-    submitted = st.form_submit_button("Submit")
-    generate_response(text)

+ 1 - 1
recipes/inference/local_inference/README.md

@@ -69,7 +69,7 @@ In case you have fine-tuned your model with pure FSDP and saved the checkpoints
 This is helpful if you have fine-tuned your model using FSDP only, as follows:
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
+torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
 ```
 Then convert your FSDP checkpoint to HuggingFace checkpoints using:
 ```bash

+ 2 - 4
recipes/inference/model_servers/README.md

@@ -1,4 +1,2 @@
-## [Running Llama2 On-Prem with vLLM and TGI](llama-on-prem.md)
-This tutorial shows how to use Llama 2 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 2 on-prem apps.
-
-\* To run a quantized Llama2 model on iOS and Android, you can use  the open source [MLC LLM](https://github.com/mlc-ai/mlc-llm) or [llama.cpp](https://github.com/ggerganov/llama.cpp). You can even make a Linux OS that boots to Llama2 ([repo](https://github.com/trholding/llama2.c)).
+## [Running Llama 3 On-Prem with vLLM and TGI](llama-on-prem.md)
+This tutorial shows how to use Llama 3 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 3 on-prem apps.

File diff suppressed because it is too large
+ 698 - 0
recipes/use_cases/MediaGen.ipynb


+ 3 - 1
recipes/use_cases/README.md

@@ -17,5 +17,7 @@ This step-by-step tutorial shows how to use the [Messenger Platform](https://dev
 A complete example of how to build a Llama 3 chatbot hosted in your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama 3 locally if you have a good enough GPU, or on OctoAI if you follow the note [here](../README.md#octoai_note).
 
 ## [Sales Bot](./chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case
-
 A summarization + RAG use case built around the Amazon product review Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services.
+
+## [Media Generation](./MediaGen.ipynb): Building a Video Generation Pipeline with Llama3
+This step-by-step tutorial shows how to leverage Llama 3 to drive the generation of animated videos using SDXL and SVD. More specifically, it relies on JSON formatting to produce a scene-by-scene storyboard of a recipe video. The user provides the name of a dish, then Llama 3 generates a step-by-step guide to prepare it; that guide is then brought to life with models like SDXL and SVD.

File diff suppressed because it is too large
+ 1261 - 0
recipes/use_cases/llamaindex_cookbook.ipynb


+ 3 - 0
scripts/spellcheck_conf/wordlist.txt

@@ -1317,3 +1317,6 @@ vectorstore
 Kaggle
 SalesBot
 Weaviate
+MediaGen
+SDXL
+SVD

+ 1 - 1
src/llama_recipes/utils/config_utils.py

@@ -34,7 +34,7 @@ def update_config(config, **kwargs):
                     if hasattr(config, param_name):
                         setattr(config, param_name, v)
                     else:
-                        # In case of specialized config we can warm user
+                        # In case of specialized config we can warn user
                         print(f"Warning: {config_name} does not accept parameter: {k}")
             elif isinstance(config, train_config):
                 print(f"Warning: unknown parameter {k}")
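A hypothetical illustration of the branch whose comment is fixed above, assuming `update_config` matches dotted overrides against the config's class name as the surrounding code suggests; the dataclass here is a stand-in, not the real `fsdp_config`:

```python
from dataclasses import dataclass
from llama_recipes.utils.config_utils import update_config

@dataclass
class fsdp_config:  # hypothetical stand-in for the real config class
    pure_bf16: bool = False

cfg = fsdp_config()
update_config(cfg, **{"fsdp_config.pure_bf16": True})   # sets cfg.pure_bf16 via setattr
update_config(cfg, **{"fsdp_config.pure_bf32": True})   # no such field: hits the warning print
```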

+ 2 - 0
src/llama_recipes/utils/train_utils.py

@@ -103,6 +103,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     val_loss = []
 
     if train_config.save_metrics:
+        if not os.path.exists(train_config.output_dir):
+            os.makedirs(train_config.output_dir, exist_ok=True)
         metrics_filename = f"{train_config.output_dir}/metrics_data_{local_rank}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
         train_step_perplexity = []
         train_step_loss = []
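One note on the fix above: `os.makedirs(..., exist_ok=True)` is by itself idempotent and safe when several ranks race to create the directory, so the preceding `os.path.exists` check is defensive rather than required. A standalone equivalent:

```python
import os

# Safe to call even if the directory already exists, and safe when multiple
# processes try to create it at the same time.
os.makedirs("path/to/output_dir", exist_ok=True)
```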